From 0c8e2cc602bab3c8c1127b50aa26fa6f6954e58b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 21 Nov 2023 00:36:28 +1100 Subject: [PATCH 1/5] *: actually support joining a userns with a new container (This is a cherry-pick of 1912d5988bbb379189ea9ceb2e03945738c513dc.) Our handling for name space paths with user namespaces has been broken for a long time. In particular, the need to parse /proc/self/*id_map in quite a few places meant that we would treat userns configurations that had a namespace path as if they were a userns configuration without mappings, resulting in errors. The primary issue was down to the id translation helper functions, which could only handle configurations that had explicit mappings. Obviously, when joining a user namespace we need to map the ids but figuring out the correct mapping is non-trivial in comparison. In order to get the mapping, you need to read /proc//*id_map of a process inside the userns -- while most userns paths will be of the form /proc//ns/user (and we have a fast-path for this case), this is not guaranteed and thus it is necessary to spawn a process inside the container and read its /proc//*id_map files in the general case. As Go does not allow us spawn a subprocess into a target userns, we have to use CGo to fork a sub-process which does the setns(2). To be honest, this is a little dodgy in regards to POSIX signal-safety(7) but since we do no allocations and we are executing in the forked context from a Go program (not a C program), it should be okay. The other alternative would be to do an expensive re-exec (a-la nsexec which would make several other bits of runc more complicated), or to use nsenter(1) which might not exist on the system and is less than ideal. Because we need to logically remap users quite a few times in runc (including in "runc init", where joining the namespace is not feasable), we cache the mapping inside the libcontainer config struct. A future patch will make sure that we stop allow invalid user configurations where a mapping is specified as well as a userns path to join. Finally, add an integration test to make sure we don't regress this again. Signed-off-by: Aleksa Sarai --- libcontainer/configs/validate/rootless.go | 31 ++-- libcontainer/specconv/spec_linux.go | 16 ++ libcontainer/userns/userns_maps.c | 79 ++++++++++ libcontainer/userns/userns_maps_linux.go | 171 ++++++++++++++++++++++ tests/integration/userns.bats | 83 ++++++++++- 5 files changed, 360 insertions(+), 20 deletions(-) create mode 100644 libcontainer/userns/userns_maps.c create mode 100644 libcontainer/userns/userns_maps_linux.go diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go index 9a6e5eb32a3..37c383366fe 100644 --- a/libcontainer/configs/validate/rootless.go +++ b/libcontainer/configs/validate/rootless.go @@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { return nil } -func hasIDMapping(id int, mappings []configs.IDMap) bool { - for _, m := range mappings { - if id >= m.ContainerID && id < m.ContainerID+m.Size { - return true - } - } - return false -} - func rootlessEUIDMappings(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWUSER) { return errors.New("rootless container requires user namespaces") } - - if len(config.UidMappings) == 0 { - return errors.New("rootless containers requires at least one UID mapping") - } - if len(config.GidMappings) == 0 { - return errors.New("rootless containers requires at least one GID mapping") + // We only require mappings if we are not joining another userns. + if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" { + if len(config.UidMappings) == 0 { + return errors.New("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return errors.New("rootless containers requires at least one GID mapping") + } } return nil } @@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(uid, config.UidMappings) { - return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") + if _, err := config.HostUID(uid); err != nil { + return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err) } } @@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(gid, config.GidMappings) { - return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") + if _, err := config.HostGID(gid); err != nil { + return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err) } } } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 1b358b24b4d..bef6a51df42 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/userns" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -939,6 +940,21 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { config.GidMappings = append(config.GidMappings, create(m)) } } + if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // Cache the current userns mappings in our configuration, so that we + // can calculate uid and gid mappings within runc. These mappings are + // never used for configuring the container if the path is set. + uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) + if err != nil { + return fmt.Errorf("failed to cache mappings for userns: %w", err) + } + config.UidMappings = uidMap + config.GidMappings = gidMap + logrus.WithFields(logrus.Fields{ + "uid_map": uidMap, + "gid_map": gidMap, + }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") + } rootUID, err := config.HostRootUID() if err != nil { return err diff --git a/libcontainer/userns/userns_maps.c b/libcontainer/userns/userns_maps.c new file mode 100644 index 00000000000..84f2c6188c3 --- /dev/null +++ b/libcontainer/userns/userns_maps.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +/* + * All of the code here is run inside an aync-signal-safe context, so we need + * to be careful to not call any functions that could cause issues. In theory, + * since we are a Go program, there are fewer restrictions in practice, it's + * better to be safe than sorry. + * + * The only exception is exit, which we need to call to make sure we don't + * return into runc. + */ + +void bail(int pipefd, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vdprintf(pipefd, fmt, args); + va_end(args); + + exit(1); +} + +int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd) +{ + char buffer[4096] = { 0 }; + + pid_t child = fork(); + if (child != 0) + return child; + /* in child */ + + /* Join the target userns. */ + int nsfd = open(userns_path, O_RDONLY); + if (nsfd < 0) + bail(errfd, "open userns path %s failed: %m", userns_path); + + int err = setns(nsfd, CLONE_NEWUSER); + if (err < 0) + bail(errfd, "setns %s failed: %m", userns_path); + + close(nsfd); + + /* Pipe the requested file contents. */ + int fd = open(path, O_RDONLY); + if (fd < 0) + bail(errfd, "open %s in userns %s failed: %m", path, userns_path); + + int nread, ntotal = 0; + while ((nread = read(fd, buffer, sizeof(buffer))) != 0) { + if (nread < 0) + bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal); + ntotal += nread; + + int nwritten = 0; + while (nwritten < nread) { + int n = write(outfd, buffer, nread - nwritten); + if (n < 0) + bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m", + nread - nwritten, path, nwritten); + nwritten += n; + } + if (nread != nwritten) + bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten); + } + + close(fd); + close(outfd); + close(errfd); + + /* We must exit here, otherwise we would return into a forked runc. */ + exit(0); +} diff --git a/libcontainer/userns/userns_maps_linux.go b/libcontainer/userns/userns_maps_linux.go new file mode 100644 index 00000000000..d81290de2cb --- /dev/null +++ b/libcontainer/userns/userns_maps_linux.go @@ -0,0 +1,171 @@ +//go:build linux + +package userns + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +/* +#include +extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); +*/ +import "C" + +func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + var m configs.IDMap + line := scanner.Text() + if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { + return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) + } + ms = append(ms, m) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("parsing id map failed: %w", err) + } + return ms, nil +} + +// Do something equivalent to nsenter --user= cat , but more +// efficiently. Returns the contents of the requested file from within the user +// namespace. +func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) + } + defer rdr.Close() + defer wtr.Close() + + errRdr, errWtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) + } + defer errRdr.Close() + defer errWtr.Close() + + cNsPath := C.CString(nsPath) + defer C.free(unsafe.Pointer(cNsPath)) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) + + if childPid < 0 { + return nil, fmt.Errorf("failed to spawn fork for userns") + } else if childPid == 0 { + // this should never happen + panic("runc executing inside fork child -- unsafe state!") + } + + // We are in the parent -- close the write end of the pipe before reading. + wtr.Close() + output, err := io.ReadAll(rdr) + rdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn failed: %w", err) + } + + // Ditto for the error pipe. + errWtr.Close() + errOutput, err := io.ReadAll(errRdr) + errRdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) + } + errOutput = bytes.TrimSpace(errOutput) + + // Clean up the child. + child, err := os.FindProcess(int(childPid)) + if err != nil { + return nil, fmt.Errorf("could not find userns spawn process: %w", err) + } + state, err := child.Wait() + if err != nil { + return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) + } + if !state.Success() { + errStr := string(errOutput) + if errStr == "" { + errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) + } + return nil, fmt.Errorf("userns spawn: %s", errStr) + } else if len(errOutput) > 0 { + // We can just ignore weird output in the error pipe if the process + // didn't bail(), but for completeness output for debugging. + logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) + } + // The subprocess succeeded, return whatever it wrote to the pipe. + return output, nil +} + +func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { + var ( + pid int + extra rune + tryFastPath bool + ) + + // nsPath is usually of the form /proc//ns/user, which means that we + // already have a pid that is part of the user namespace and thus we can + // just use the pid to read from /proc//*id_map. + // + // Note that Sscanf doesn't consume the whole input, so we check for any + // trailing data with %c. That way, we can be sure the pattern matched + // /proc/$pid/ns/user _exactly_ iff n === 1. + if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { + tryFastPath = pid > 0 + } + + for _, mapType := range []struct { + name string + idMap *[]configs.IDMap + }{ + {"uid_map", &uidMap}, + {"gid_map", &gidMap}, + } { + var mapData []byte + + if tryFastPath { + path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) + data, err := os.ReadFile(path) + if err != nil { + // Do not error out here -- we need to try the slow path if the + // fast path failed. + logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) + } else { + mapData = data + } + } else { + logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) + } + + if mapData == nil { + // We have to actually join the namespace if we cannot take the + // fast path. The path is resolved with respect to the child + // process, so just use /proc/self. + data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) + if err != nil { + return nil, nil, err + } + mapData = data + } + idMap, err := parseIdmapData(mapData) + if err != nil { + return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) + } + *mapType.idMap = idMap + } + + return uidMap, gidMap, nil +} diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats index eca1e0ce58a..2329119a2fe 100644 --- a/tests/integration/userns.bats +++ b/tests/integration/userns.bats @@ -15,15 +15,25 @@ function setup() { mkdir -p rootfs/{proc,sys,tmp} mkdir -p rootfs/tmp/mount-{1,2} + to_umount_list="$(mktemp "$BATS_RUN_TMPDIR/userns-mounts.XXXXXX")" if [ "$ROOTLESS" -eq 0 ]; then update_config ' .linux.namespaces += [{"type": "user"}] | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] - | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + | .linux.gidMappings += [{"hostID": 200000, "containerID": 0, "size": 65534}] ' fi } function teardown() { teardown_bundle + + if [ -v to_umount_list ]; then + while read -r mount_path; do + umount -l "$mount_path" || : + rm -f "$mount_path" + done <"$to_umount_list" + rm -f "$to_umount_list" + unset to_umount_list + fi } @test "userns with simple mount" { @@ -83,3 +93,74 @@ function teardown() { runc exec test_busybox cat /sys/fs/cgroup/{pids,memory}/tasks [ "$status" -eq 0 ] } + +@test "userns join other container userns" { + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Configure our container to attach to the first container's userns. + target_pid="$(__runc state target_userns | jq .pid)" + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "/proc/'"$target_pid"'/ns/" + .type) else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} + +@test "userns join other container userns [bind-mounted nsfd]" { + requires root + + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Bind-mount the first containers userns nsfd to a different path, to + # exercise the non-fast-path (where runc has to join the userns to get the + # mappings). + target_pid="$(__runc state target_userns | jq .pid)" + userns_path=$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX") + mount --bind "/proc/$target_pid/ns/user" "$userns_path" + echo "$userns_path" >>"$to_umount_list" + + # Configure our container to attach to the first container's userns. + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} From 8f8cb45572c7574325c89ffdf8f5b8112a4e7a97 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 21 Nov 2023 17:20:36 +1100 Subject: [PATCH 2/5] configs: disallow ambiguous userns and timens configurations (This is a cherry-pick of 09822c3da8ad8fa91b8796c5abf27ef06814a0c3.) For userns and timens, the mappings (and offsets, respectively) cannot be changed after the namespace is first configured. Thus, configuring a container with a namespace path to join means that you cannot also provide configuration for said namespace. Previously we would silently ignore the configuration (and just join the provided path), but we really should be returning an error (especially when you consider that the configuration userns mappings are used quite a bit in runc with the assumption that they are the correct mapping for the userns -- but in this case they are not). In the case of userns, the mappings are also required if you _do not_ specify a path, while in the case of the time namespace you can have a container with a timens but no mappings specified. It should be noted that the case checking that the user has not specified a userns path and a userns mapping needs to be handled in specconv (as opposed to the configuration validator) because with this patchset we now cache the mappings of path-based userns configurations and thus the validator can't be sure whether the mapping is a cached mapping or a user-specified one. So we do the validation in specconv, and thus the test for this needs to be an integration test. Signed-off-by: Aleksa Sarai --- libcontainer/configs/validate/validator.go | 12 +++++-- .../configs/validate/validator_test.go | 10 +++--- libcontainer/integration/exec_test.go | 6 ++++ libcontainer/specconv/spec_linux.go | 5 +++ libcontainer/specconv/spec_linux_test.go | 34 +++++++++++++++++++ 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 4fbd308dadd..ece70a45d31 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error { func (v *ConfigValidator) usernamespace(config *configs.Config) error { if config.Namespaces.Contains(configs.NEWUSER) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - return errors.New("USER namespaces aren't enabled in the kernel") + return errors.New("user namespaces aren't enabled in the kernel") } + hasPath := config.Namespaces.PathOf(configs.NEWUSER) != "" + hasMappings := config.UidMappings != nil || config.GidMappings != nil + if !hasPath && !hasMappings { + return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified") + } + // The hasPath && hasMappings validation case is handled in specconv -- + // we cache the mappings in Config during specconv in the hasPath case, + // so we cannot do that validation here. } else { if config.UidMappings != nil || config.GidMappings != nil { - return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config") + return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config") } } return nil diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index 5181333fb12..e89a60b08c5 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -150,7 +150,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) { } } -func TestValidateUsernamespace(t *testing.T) { +func TestValidateUserNamespace(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { t.Skip("Test requires userns.") } @@ -161,6 +161,8 @@ func TestValidateUsernamespace(t *testing.T) { {Type: configs.NEWUSER}, }, ), + UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, + GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, } validator := New() @@ -170,11 +172,11 @@ func TestValidateUsernamespace(t *testing.T) { } } -func TestValidateUsernamespaceWithoutUserNS(t *testing.T) { - uidMap := configs.IDMap{ContainerID: 123} +func TestValidateUsernsMappingWithoutNamespace(t *testing.T) { config := &configs.Config{ Rootfs: "/var", - UidMappings: []configs.IDMap{uidMap}, + UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, + GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, } validator := New() diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 5c6272e5d42..670b33fcbb3 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -1588,6 +1589,11 @@ func TestInitJoinNetworkAndUser(t *testing.T) { config2 := newTemplateConfig(t, &tParam{userns: true}) config2.Namespaces.Add(configs.NEWNET, netns1) config2.Namespaces.Add(configs.NEWUSER, userns1) + // Emulate specconv.setupUserNamespace(). + uidMap, gidMap, err := userns.GetUserNamespaceMappings(userns1) + ok(t, err) + config2.UidMappings = uidMap + config2.GidMappings = gidMap config2.Cgroups.Path = "integration/test2" container2, err := newContainer(t, config2) ok(t, err) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index bef6a51df42..0cbf578112d 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -941,6 +941,11 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { } } if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // We cannot allow uid or gid mappings to be set if we are also asked + // to join a userns. + if config.UidMappings != nil || config.GidMappings != nil { + return errors.New("user namespaces enabled, but both namespace path and mapping specified -- you may only provide one") + } // Cache the current userns mappings in our configuration, so that we // can calculate uid and gid mappings within runc. These mappings are // never used for configuring the container if the path is set. diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index 79768e31861..68e57a08308 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -610,6 +610,40 @@ func TestDupNamespaces(t *testing.T) { } } +func TestUserNamespaceMappingAndPath(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("Test requires userns.") + } + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: "rootfs", + }, + Linux: &specs.Linux{ + UIDMappings: []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000, Size: 1000}, + }, + GIDMappings: []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 2000, Size: 1000}, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "user", + Path: "/proc/1/ns/user", + }, + }, + }, + } + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + + if !strings.Contains(err.Error(), "user namespaces enabled, but both namespace path and mapping specified") { + t.Errorf("user namespace with mapping and namespace path should be forbidden") + } +} + func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { t.Skip("Test requires userns.") From 2dd8368e5737e47526bf2412e1160a73911046dd Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 23 Nov 2023 12:40:07 +1100 Subject: [PATCH 3/5] integration: add mega-test for joining namespaces (This is a cherry-pick of 6fa8d068438ed47e8318448c34fc4587612a0740.) Given we've had several bugs in this behaviour that have now been fixed, add an integration test that makes sure that you can start a container that joins all of the namespaces of a second container. The only namespace we do not join is the mount namespace, because joining a namespace that has been pivot_root'd leads to a bunch of errors. In principle, removing everything from config.json that requires a mount _should_ work, but the root.path configuration is mandatory and we cannot just ignore setting up the rootfs in the namespace joining scenario (if the user has configured a different rootfs, we need to use it or error out, and there's no reasonable way of checking if if the rootfs paths are the same that doesn't result in spaghetti logic). Signed-off-by: Aleksa Sarai --- tests/integration/run.bats | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/integration/run.bats b/tests/integration/run.bats index e59ccb4a43d..88540c763d1 100644 --- a/tests/integration/run.bats +++ b/tests/integration/run.bats @@ -108,3 +108,60 @@ function teardown() { [ "$status" -eq 0 ] [ "$output" = "410" ] } + +@test "runc run [joining existing container namespaces]" { + # Create a detached container with the namespaces we want. We notably want + # to include userns, which requires config-related configuration. + if [ $EUID -eq 0 ]; then + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 100}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 200000, "size": 200}]' + mkdir -p rootfs/{proc,sys,tmp} + fi + update_config '.process.args = ["sleep", "infinity"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" target_ctr + [ "$status" -eq 0 ] + + # Modify our container's configuration such that it is just going to + # inherit all of the namespaces of the target container. + # + # NOTE: We cannot join the mount namespace of another container because of + # some quirks of the runtime-spec. In particular, we MUST pivot_root into + # root.path and root.path MUST be set in the config, so runc cannot just + # ignore root.path when joining namespaces (and root.path doesn't exist + # inside root.path, for obvious reasons). + # + # We could hack around this (create a copy of the rootfs inside the rootfs, + # or use a simpler mount namespace target), but those wouldn't be similar + # tests to the other namespace joining tests. + target_pid="$(__runc state target_ctr | jq .pid)" + update_config '.linux.namespaces |= map_values(.path = if .type == "mount" then "" else "/proc/'"$target_pid"'/ns/" + ({"network": "net", "mount": "mnt"}[.type] // .type) end)' + # Remove the userns configuration (it cannot be changed). + update_config '.linux |= (del(.uidMappings) | del(.gidMappings))' + + runc run -d --console-socket "$CONSOLE_SOCKET" attached_ctr + [ "$status" -eq 0 ] + + # Make sure there are two sleep processes in our container. + runc exec attached_ctr ps aux + [ "$status" -eq 0 ] + run -0 grep "sleep infinity" <<<"$output" + [ "${#lines[@]}" -eq 2 ] + + # ... that the userns mappings are the same... + runc exec attached_ctr cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+100$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + runc exec attached_ctr cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+200$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} From e65d4cac663aa4e720cf14ba1bfd3a8900801976 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Dec 2023 22:54:53 +1100 Subject: [PATCH 4/5] specconv: temporarily allow userns path and mapping if they match (This is a cherry-pick of ebcef3e651e61aeee96546301d8db9e92b505ce6.) It turns out that the error added in commit 09822c3da8ad ("configs: disallow ambiguous userns and timens configurations") causes issues with containerd and CRIO because they pass both userns mappings and a userns path. These configurations are broken, but to avoid the regression in this one case, output a warning to tell the user that the configuration is incorrect but we will continue to use it if and only if the configured mappings are identical to the mappings of the provided namespace. Fixes: 09822c3da8ad ("configs: disallow ambiguous userns and timens configurations") Signed-off-by: Aleksa Sarai --- libcontainer/specconv/spec_linux.go | 24 +++++++++++++++++++----- libcontainer/specconv/spec_linux_test.go | 4 ++-- libcontainer/userns/userns_maps_linux.go | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 0cbf578112d..e0bb528f39a 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -941,11 +941,6 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { } } if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { - // We cannot allow uid or gid mappings to be set if we are also asked - // to join a userns. - if config.UidMappings != nil || config.GidMappings != nil { - return errors.New("user namespaces enabled, but both namespace path and mapping specified -- you may only provide one") - } // Cache the current userns mappings in our configuration, so that we // can calculate uid and gid mappings within runc. These mappings are // never used for configuring the container if the path is set. @@ -953,6 +948,25 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { if err != nil { return fmt.Errorf("failed to cache mappings for userns: %w", err) } + // We cannot allow uid or gid mappings to be set if we are also asked + // to join a userns. + if config.UidMappings != nil || config.GidMappings != nil { + // FIXME: It turns out that containerd and CRIO pass both a userns + // path and the mappings of the namespace in the same config.json. + // Such a configuration is technically not valid, but we used to + // require mappings be specified, and thus users worked around our + // bug -- so we can't regress it at the moment. But we also don't + // want to produce broken behaviour if the mapping doesn't match + // the userns. So (for now) we output a warning if the actual + // userns mappings match the configuration, otherwise we return an + // error. + if !userns.IsSameMapping(uidMap, config.UidMappings) || + !userns.IsSameMapping(gidMap, config.GidMappings) { + return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one") + } + logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on if you see this warning and cannot update your configuration.") + } + config.UidMappings = uidMap config.GidMappings = gidMap logrus.WithFields(logrus.Fields{ diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index 68e57a08308..f79ea0e3727 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -639,8 +639,8 @@ func TestUserNamespaceMappingAndPath(t *testing.T) { Spec: spec, }) - if !strings.Contains(err.Error(), "user namespaces enabled, but both namespace path and mapping specified") { - t.Errorf("user namespace with mapping and namespace path should be forbidden") + if !strings.Contains(err.Error(), "both namespace path and non-matching mapping specified") { + t.Errorf("user namespace with path and non-matching mapping should be forbidden, got error %v", err) } } diff --git a/libcontainer/userns/userns_maps_linux.go b/libcontainer/userns/userns_maps_linux.go index d81290de2cb..7a8c2b023b3 100644 --- a/libcontainer/userns/userns_maps_linux.go +++ b/libcontainer/userns/userns_maps_linux.go @@ -169,3 +169,18 @@ func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, er return uidMap, gidMap, nil } + +// IsSameMapping returns whether or not the two id mappings are the same. Note +// that if the order of the mappings is different, or a mapping has been split, +// the mappings will be considered different. +func IsSameMapping(a, b []configs.IDMap) bool { + if len(a) != len(b) { + return false + } + for idx := range a { + if a[idx] != b[idx] { + return false + } + } + return true +} From 617db78515a5fbaef9038f43ac0da5aa09adee88 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 8 Dec 2023 11:07:51 +1100 Subject: [PATCH 5/5] configs: make id mappings int64 to better handle 32-bit (This is a cherry-pick of 482e56379a1c2f8b2573f07c06013510e59f9eb8.) Using ints for all of our mapping structures means that a 32-bit binary errors out when trying to parse /proc/self/*id_map: failed to cache mappings for userns: failed to parse uid_map of userns /proc/1/ns/user: parsing id map failed: invalid format in line " 0 0 4294967295": integer overflow on token 4294967295 This issue was unearthed by commit 1912d5988bbb ("*: actually support joining a userns with a new container") but the underlying issue has been present since the docker/libcontainer days. In theory, switching to uint32 (to match the spec) instead of int64 would also work, but keeping everything signed seems much less error-prone. It's also important to note that a mapping might be too large for an int on 32-bit, so we detect this during the mapping. Signed-off-by: Aleksa Sarai --- libcontainer/configs/config.go | 6 +++--- libcontainer/configs/config_linux.go | 30 ++++++++++++++++++++++------ libcontainer/container_linux.go | 2 +- libcontainer/specconv/spec_linux.go | 6 +++--- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index c1b4a0041c2..6ebf5ec7b60 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -21,9 +21,9 @@ type Rlimit struct { // IDMap represents UID/GID Mappings for User Namespaces. type IDMap struct { - ContainerID int `json:"container_id"` - HostID int `json:"host_id"` - Size int `json:"size"` + ContainerID int64 `json:"container_id"` + HostID int64 `json:"host_id"` + Size int64 `json:"size"` } // Seccomp represents syscall restrictions diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go index 8c02848b70a..51fe940748a 100644 --- a/libcontainer/configs/config_linux.go +++ b/libcontainer/configs/config_linux.go @@ -1,6 +1,10 @@ package configs -import "errors" +import ( + "errors" + "fmt" + "math" +) var ( errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") @@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) { if c.UidMappings == nil { return -1, errNoUIDMap } - id, found := c.hostIDFromMapping(containerId, c.UidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings) if !found { return -1, errNoUserMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setuid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) { if c.GidMappings == nil { return -1, errNoGIDMap } - id, found := c.hostIDFromMapping(containerId, c.GidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings) if !found { return -1, errNoGroupMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setgid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) { // Utility function that gets a host ID for a container ID from user namespace map // if that ID is present in the map. -func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { +func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) { for _, m := range uMap { if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { hostID := m.HostID + (containerID - m.ContainerID) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 2f17e37d43d..59aa0338ac6 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -2268,7 +2268,7 @@ func ignoreTerminateErrors(err error) error { func requiresRootOrMappingTool(c *configs.Config) bool { gidMap := []configs.IDMap{ - {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1}, } return !reflect.DeepEqual(c.GidMappings, gidMap) } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index e0bb528f39a..7dbfb8691d0 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -927,9 +927,9 @@ next: func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { create := func(m specs.LinuxIDMapping) configs.IDMap { return configs.IDMap{ - HostID: int(m.HostID), - ContainerID: int(m.ContainerID), - Size: int(m.Size), + HostID: int64(m.HostID), + ContainerID: int64(m.ContainerID), + Size: int64(m.Size), } } if spec.Linux != nil {