From d7208f59105e079ba037aea05f9d0603ba7f779f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 24 Apr 2023 15:49:19 -0700 Subject: [PATCH] libct/cg/sd: use systemd version when generating dev props Commit 343951a22b58c38feb added a call to os.Stat for the device path when generating systemd device properties, to avoid systemd warning for non-existing devices. The idea was, since systemd uses stat(2) to look up device properties for a given path, it will fail anyway. In addition, this allowed to suppress a warning like this from systemd: > Couldn't stat device /dev/char/10:200 NOTE that this was done because: - systemd could not add the rule anyway; - runs puts its own set of rules on top of what systemd does. Apparently, the above change broke some setups, resulting in inability to use e.g. /dev/null inside a container. My guess is this is because in cgroup v2 we add a second eBPF program, which is not used if the first one (added by systemd) returns "access denied". Next, commit 3b9582895b8685 fixed that by adding a call to os.Stat for "/sys/"+path (meaning, if "/dev/char/10:200" does not exist, we retry with "/sys/dev/char/10:200", and if it exists, proceed with adding a device rule with the original (non-"/sys") path). How that second fix ever worked was a mystery, because the path we gave to systemd still doesn't exist. Well, I think now I know. Since systemd v240 (commit 74c48bf5a8005f20) device access rules specified as /dev/{block|char}/MM:mm are no longer looked up on the filesystem, instead, if possible, those are parsed from the string. So, we need to do different things, depending on systemd version: - for systemd >= v240, use the /dev/{char,block}/MM:mm as is, without doing stat() -- since systemd doesn't do stat() either; - for older version, check if the path exists, and skip passing it on to systemd otherwise. - the check for /sys/dev/{block,char}/MM:mm is not needed in either case. Pass the systemd version to the function that generates the rules, and fix it accordingly. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/devices/systemd.go | 32 ++++++++++--------------- libcontainer/cgroups/systemd/common.go | 6 ++--- libcontainer/cgroups/systemd/v1.go | 2 +- libcontainer/cgroups/systemd/v2.go | 2 +- 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/libcontainer/cgroups/devices/systemd.go b/libcontainer/cgroups/devices/systemd.go index eb5ecb064d1..b78d90eafb0 100644 --- a/libcontainer/cgroups/devices/systemd.go +++ b/libcontainer/cgroups/devices/systemd.go @@ -17,7 +17,7 @@ import ( // systemdProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func systemdProperties(r *configs.Resources) ([]systemdDbus.Property, error) { +func systemdProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) { if r.SkipDevices { return nil, nil } @@ -78,9 +78,10 @@ func systemdProperties(r *configs.Resources) ([]systemdDbus.Property, error) { // trickery to convert things: // // * Concrete rules with non-wildcard major/minor numbers have to use - // /dev/{block,char} paths. This is slightly odd because it means - // that we cannot add whitelist rules for devices that don't exist, - // but there's not too much we can do about that. + // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses + // stat(2) on such paths to look up device properties, meaning we + // cannot add whitelist rules for devices that don't exist. Since v240, + // device properties are parsed from the path string. // // However, path globbing is not support for path-based rules so we // need to handle wildcards in some other manner. @@ -128,21 +129,14 @@ func systemdProperties(r *configs.Resources) ([]systemdDbus.Property, error) { case devices.CharDevice: entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) } - // systemd will issue a warning if the path we give here doesn't exist. - // Since all of this logic is best-effort anyway (we manually set these - // rules separately to systemd) we can safely skip entries that don't - // have a corresponding path. - if _, err := os.Stat(entry.Path); err != nil { - // Also check /sys/dev so that we don't depend on /dev/{block,char} - // being populated. (/dev/{block,char} is populated by udev, which - // isn't strictly required for systemd). Ironically, this happens most - // easily when starting containerd within a runc created container - // itself. - - // We don't bother with securejoin here because we create entry.Path - // right above here, so we know it's safe. - if _, err := os.Stat("/sys" + entry.Path); err != nil { - logrus.Warnf("skipping device %s for systemd: %s", entry.Path, err) + if sdVer < 240 { + // Old systemd versions use stat(2) on path to find out device major:minor + // numbers and type. If the path doesn't exist, it will not add the rule, + // emitting a warning instead. + // Since all of this logic is best-effort anyway (we manually set these + // rules separately to systemd) we can safely skip entries that don't + // have a corresponding path. + if _, err := os.Stat(entry.Path); err != nil { continue } } diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index e6db845cb13..c577a22b03a 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -33,7 +33,7 @@ var ( isRunningSystemdOnce sync.Once isRunningSystemd bool - GenerateDeviceProps func(*configs.Resources) ([]systemdDbus.Property, error) + GenerateDeviceProps func(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) ) // NOTE: This function comes from package github.com/coreos/go-systemd/util @@ -342,7 +342,7 @@ func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems st // generateDeviceProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) { +func generateDeviceProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { if GenerateDeviceProps == nil { if len(r.Devices) > 0 { return nil, cgroups.ErrDevicesUnsupported @@ -350,5 +350,5 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err return nil, nil } - return GenerateDeviceProps(r) + return GenerateDeviceProps(r, systemdVersion(cm)) } diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index a26b5f29c70..ffe87798782 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -75,7 +75,7 @@ var legacySubsystems = []subsystem{ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property - deviceProperties, err := generateDeviceProperties(r) + deviceProperties, err := generateDeviceProperties(r, cm) if err != nil { return nil, err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index c5576441c47..b28ec6b22f2 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -214,7 +214,7 @@ func genV2ResourcesProperties(dirPath string, r *configs.Resources, cm *dbusConn // aren't the end of the world, but it is a bit concerning. However // it's unclear if systemd removes all eBPF programs attached when // doing SetUnitProperties... - deviceProperties, err := generateDeviceProperties(r) + deviceProperties, err := generateDeviceProperties(r, cm) if err != nil { return nil, err }