From 094bc673ef0ccfe535564a153ce1997592bb8754 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 20 Jun 2024 11:37:19 +0200 Subject: [PATCH 1/4] libpod: unlock the thread if possible Signed-off-by: Giuseppe Scrivano --- libpod/oci_conmon_linux.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index e62489400027..91090bba2f96 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -45,8 +45,13 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption return 0, err } defer func() { - if err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS); err != nil { - logrus.Errorf("Unable to clone new namespace: %q", err) + err := unix.Setns(int(fd.Fd()), unix.CLONE_NEWNS) + if err == nil { + // If we are able to reset the previous mount namespace, unlock the thread and reuse it + runtime.UnlockOSThread() + } else { + // otherwise, leave the thread locked and the Go runtime will terminate it + logrus.Errorf("Unable to reset the previous mount namespace: %q", err) } }() From c81f075f436466092372dec7a19c35fe387fe8d3 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 18 Jun 2024 17:05:01 +0200 Subject: [PATCH 2/4] libpod: do not chmod bind mounts with the new mount API is available, the OCI runtime doesn't require that each parent directory for a bind mount must be accessible. Instead it is opened in the initial user namespace and passed down to the container init process. This requires that the kernel supports the new mount API and that the OCI runtime uses it. Signed-off-by: Giuseppe Scrivano --- libpod/container_internal_common.go | 9 --------- libpod/oci_conmon_common.go | 8 +++----- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go index 70f6f741f5de..65afbf027418 100644 --- a/libpod/container_internal_common.go +++ b/libpod/container_internal_common.go @@ -1917,15 +1917,6 @@ func (c *Container) makeBindMounts() error { return fmt.Errorf("assigning mounts to container %s: %w", c.ID(), err) } } - - if !hasCurrentUserMapped(c) { - if err := makeAccessible(resolvPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - if err := makeAccessible(hostsPath, c.RootUID(), c.RootGID()); err != nil { - return err - } - } } else { if !c.config.UseImageResolvConf { if err := c.createResolvConf(); err != nil { diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 548286af0f47..d72f7eb14c54 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -183,16 +183,14 @@ func hasCurrentUserMapped(ctr *Container) bool { // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - // always make the run dir accessible to the current user so that the PID files can be read without + // always make the container directory accessible to the current user so that the PID files can be read without // being in the rootless user namespace. if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { return 0, err } if !hasCurrentUserMapped(ctr) { - for _, i := range []string{ctr.state.RunDir, ctr.runtime.config.Engine.TmpDir, ctr.config.StaticDir, ctr.state.Mountpoint, ctr.runtime.config.Engine.VolumePath} { - if err := makeAccessible(i, ctr.RootUID(), ctr.RootGID()); err != nil { - return 0, err - } + if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil { + return 0, err } // if we are running a non privileged container, be sure to umount some kernel paths so they are not From 08a8429459fdeddec73ffb8e5efe339f6312dba1 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 18 Jun 2024 23:09:55 +0200 Subject: [PATCH 3/4] libpod: avoid chowning the rundir to root in the userns so it is possible to remove the code to make the entire directory world accessible. Signed-off-by: Giuseppe Scrivano --- libpod/container_internal.go | 10 ---------- libpod/container_internal_common.go | 4 ---- libpod/oci_conmon_common.go | 5 ----- 3 files changed, 19 deletions(-) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index d1eeb7f85194..29a32a1c3ce6 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -544,16 +544,6 @@ func (c *Container) setupStorage(ctx context.Context) error { c.config.StaticDir = containerInfo.Dir c.state.RunDir = containerInfo.RunDir - if len(c.config.IDMappings.UIDMap) != 0 || len(c.config.IDMappings.GIDMap) != 0 { - if err := idtools.SafeChown(containerInfo.RunDir, c.RootUID(), c.RootGID()); err != nil { - return err - } - - if err := idtools.SafeChown(containerInfo.Dir, c.RootUID(), c.RootGID()); err != nil { - return err - } - } - // Set the default Entrypoint and Command if containerInfo.Config != nil { // Set CMD in the container to the default configuration only if ENTRYPOINT is not set by the user. diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go index 65afbf027418..216074e13081 100644 --- a/libpod/container_internal_common.go +++ b/libpod/container_internal_common.go @@ -1834,10 +1834,6 @@ func (c *Container) mountIntoRootDirs(mountName string, mountPath string) error // Make standard bind mounts to include in the container func (c *Container) makeBindMounts() error { - if err := idtools.SafeChown(c.state.RunDir, c.RootUID(), c.RootGID()); err != nil { - return fmt.Errorf("cannot chown run directory: %w", err) - } - if c.state.BindMounts == nil { c.state.BindMounts = make(map[string]string) } diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index d72f7eb14c54..ff0e7908600c 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -183,11 +183,6 @@ func hasCurrentUserMapped(ctr *Container) bool { // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { - // always make the container directory accessible to the current user so that the PID files can be read without - // being in the rootless user namespace. - if err := makeAccessible(ctr.state.RunDir, 0, 0); err != nil { - return 0, err - } if !hasCurrentUserMapped(ctr) { if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil { return 0, err From 49eb5af3013ef64bd017a1fd545c802e3eb230a1 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Wed, 19 Jun 2024 23:49:37 +0200 Subject: [PATCH 4/4] libpod: intermediate mount if UID not mapped into the userns if the current user is not mapped into the new user namespace, use an intermediate mount to allow the mount point to be accessible instead of opening up all the parent directories for the mountpoint. Closes: https://github.com/containers/podman/issues/23028 Signed-off-by: Giuseppe Scrivano --- libpod/container_internal.go | 71 +++++++++++++++++++++++++ libpod/container_internal_common.go | 6 ++- libpod/oci_conmon_common.go | 31 +---------- libpod/oci_conmon_freebsd.go | 2 +- libpod/oci_conmon_linux.go | 82 +++++++++++++++++++++++------ 5 files changed, 145 insertions(+), 47 deletions(-) diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 29a32a1c3ce6..a9c076085ae9 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -14,6 +14,8 @@ import ( "slices" "strconv" "strings" + "sync" + "syscall" "time" metadata "github.com/checkpoint-restore/checkpointctl/lib" @@ -2362,6 +2364,75 @@ func (c *Container) setupOCIHooks(ctx context.Context, config *spec.Spec) (map[s return allHooks, nil } +// getRootPathForOCI returns the root path to use for the OCI runtime. +// If the current user is mapped in the container user namespace, then it returns +// the container's mountpoint directly from the storage. +// Otherwise, it returns an intermediate mountpoint that is accessible to anyone. +func (c *Container) getRootPathForOCI() (string, error) { + if hasCurrentUserMapped(c) { + return c.state.Mountpoint, nil + } + return c.getIntermediateMountpointUser() +} + +var ( + intermediateMountPoint string + intermediateMountPointErr error + intermediateMountPointSync sync.Mutex +) + +// getIntermediateMountpointUser returns a path that is accessible to everyone. It must be on TMPDIR since +// the runroot/tmpdir used by libpod are accessible only to the owner. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) recreateIntermediateMountpointUser() (string, error) { + uid := os.Geteuid() + gid := os.Getegid() + for i := 0; ; i++ { + tmpDir := os.Getenv("TMPDIR") + if tmpDir == "" { + tmpDir = "/tmp" + } + dir := filepath.Join(tmpDir, fmt.Sprintf("intermediate-mountpoint-%d.%d", rootless.GetRootlessUID(), i)) + err := os.Mkdir(dir, 0755) + if err != nil { + if !errors.Is(err, os.ErrExist) { + return "", err + } + st, err2 := os.Stat(dir) + if err2 != nil { + return "", err + } + sys := st.Sys().(*syscall.Stat_t) + if !st.IsDir() || sys.Uid != uint32(uid) || sys.Gid != uint32(gid) { + continue + } + } + return dir, nil + } +} + +// getIntermediateMountpointUser returns a path that is accessible to everyone. +// To avoid TOCTOU issues, the path must be owned by the current user's UID and GID. +// The path can be used by different containers, so a mount must be created only in a private mount namespace. +func (c *Container) getIntermediateMountpointUser() (string, error) { + intermediateMountPointSync.Lock() + defer intermediateMountPointSync.Unlock() + + if intermediateMountPoint == "" || fileutils.Exists(intermediateMountPoint) != nil { + return c.recreateIntermediateMountpointUser() + } + + // update the timestamp to prevent systemd-tmpfiles from removing it + now := time.Now() + if err := os.Chtimes(intermediateMountPoint, now, now); err != nil { + if !errors.Is(err, os.ErrNotExist) { + return c.recreateIntermediateMountpointUser() + } + } + return intermediateMountPoint, intermediateMountPointErr +} + // mount mounts the container's root filesystem func (c *Container) mount() (string, error) { if c.state.State == define.ContainerStateRemoving { diff --git a/libpod/container_internal_common.go b/libpod/container_internal_common.go index 216074e13081..f6500c636279 100644 --- a/libpod/container_internal_common.go +++ b/libpod/container_internal_common.go @@ -565,7 +565,11 @@ func (c *Container) generateSpec(ctx context.Context) (s *spec.Spec, cleanupFunc return nil, nil, err } - g.SetRootPath(c.state.Mountpoint) + rootPath, err := c.getRootPathForOCI() + if err != nil { + return nil, nil, err + } + g.SetRootPath(rootPath) g.AddAnnotation("org.opencontainers.image.stopSignal", strconv.FormatUint(uint64(c.config.StopSignal), 10)) if _, exists := g.Config.Annotations[annotations.ContainerManager]; !exists { diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index ff0e7908600c..c32fba46e283 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -184,15 +184,10 @@ func hasCurrentUserMapped(ctr *Container) bool { // CreateContainer creates a container. func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { if !hasCurrentUserMapped(ctr) { - if err := makeAccessible(ctr.state.Mountpoint, ctr.RootUID(), ctr.RootGID()); err != nil { - return 0, err - } - // if we are running a non privileged container, be sure to umount some kernel paths so they are not // bind mounted inside the container at all. - if !ctr.config.Privileged && !rootless.IsRootless() { - return r.createRootlessContainer(ctr, restoreOptions) - } + hideFiles := !ctr.config.Privileged && !rootless.IsRootless() + return r.createRootlessContainer(ctr, restoreOptions, hideFiles) } return r.createOCIContainer(ctr, restoreOptions) } @@ -972,28 +967,6 @@ func (r *ConmonOCIRuntime) RuntimeInfo() (*define.ConmonInfo, *define.OCIRuntime return &conmon, &ocirt, nil } -// makeAccessible changes the path permission and each parent directory to have --x--x--x -func makeAccessible(path string, uid, gid int) error { - for ; path != "/"; path = filepath.Dir(path) { - st, err := os.Stat(path) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - if int(st.Sys().(*syscall.Stat_t).Uid) == uid && int(st.Sys().(*syscall.Stat_t).Gid) == gid { - continue - } - if st.Mode()&0111 != 0111 { - if err := os.Chmod(path, st.Mode()|0111); err != nil { - return err - } - } - } - return nil -} - // Wait for a container which has been sent a signal to stop func waitContainerStop(ctr *Container, timeout time.Duration) error { return waitPidStop(ctr.state.PID, timeout) diff --git a/libpod/oci_conmon_freebsd.go b/libpod/oci_conmon_freebsd.go index 5f113f5cbab1..f681f785a1df 100644 --- a/libpod/oci_conmon_freebsd.go +++ b/libpod/oci_conmon_freebsd.go @@ -8,7 +8,7 @@ import ( "os/exec" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { return -1, errors.New("unsupported (*ConmonOCIRuntime) createRootlessContainer") } diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 91090bba2f96..05dc65d360e2 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -3,7 +3,9 @@ package libpod import ( + "errors" "fmt" + "io/fs" "os" "os/exec" "path/filepath" @@ -25,7 +27,7 @@ import ( "golang.org/x/sys/unix" ) -func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) { +func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions, hideFiles bool) (int64, error) { type result struct { restoreDuration int64 err error @@ -40,6 +42,11 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption } defer errorhandling.CloseQuiet(fd) + rootPath, err := ctr.getRootPathForOCI() + if err != nil { + return 0, err + } + // create a new mountns on the current thread if err = unix.Unshare(unix.CLONE_NEWNS); err != nil { return 0, err @@ -54,26 +61,69 @@ func (r *ConmonOCIRuntime) createRootlessContainer(ctr *Container, restoreOption logrus.Errorf("Unable to reset the previous mount namespace: %q", err) } }() - - // don't spread our mounts around. We are setting only /sys to be slave - // so that the cleanup process is still able to umount the storage and the - // changes are propagated to the host. - err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") - if err != nil { - return 0, fmt.Errorf("cannot make /sys slave: %w", err) - } - mounts, err := pmount.GetMounts() if err != nil { return 0, err } - for _, m := range mounts { - if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { - continue + if rootPath != "" { + byMountpoint := make(map[string]*pmount.Info) + for _, m := range mounts { + byMountpoint[m.Mountpoint] = m + } + isShared := false + var parentMount string + for dir := filepath.Dir(rootPath); ; dir = filepath.Dir(dir) { + if m, found := byMountpoint[dir]; found { + parentMount = dir + for _, o := range strings.Split(m.Optional, ",") { + opt := strings.Split(o, ":") + if opt[0] == "shared" { + isShared = true + break + } + } + break + } + if dir == "/" { + return 0, fmt.Errorf("cannot find mountpoint for the root path") + } + } + + // do not propagate the bind mount on the parent mount namespace + if err := unix.Mount("", parentMount, "", unix.MS_SLAVE, ""); err != nil { + return 0, fmt.Errorf("failed to make %s slave: %w", parentMount, err) + } + + // bind mount the containers' mount path to the path where the OCI runtime expects it to be + if err := unix.Mount(ctr.state.Mountpoint, rootPath, "", unix.MS_BIND, ""); err != nil { + return 0, fmt.Errorf("failed to bind mount %s to %s: %w", ctr.state.Mountpoint, rootPath, err) + } + + if isShared { + // we need to restore the shared propagation of the parent mount so that we don't break -v $SRC:$DST:shared in the container + // if $SRC is on the same mount as the root path + if err := unix.Mount("", parentMount, "", unix.MS_SHARED, ""); err != nil { + return 0, fmt.Errorf("failed to restore MS_SHARED propagation for %s: %w", parentMount, err) + } + } + } + + if hideFiles { + // don't spread our mounts around. We are setting only /sys to be slave + // so that the cleanup process is still able to umount the storage and the + // changes are propagated to the host. + err = unix.Mount("/sys", "/sys", "none", unix.MS_REC|unix.MS_SLAVE, "") + if err != nil { + return 0, fmt.Errorf("cannot make /sys slave: %w", err) } - err = unix.Unmount(m.Mountpoint, 0) - if err != nil && !os.IsNotExist(err) { - return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + for _, m := range mounts { + if !strings.HasPrefix(m.Mountpoint, "/sys/kernel") { + continue + } + err = unix.Unmount(m.Mountpoint, 0) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return 0, fmt.Errorf("cannot unmount %s: %w", m.Mountpoint, err) + } } } return r.createOCIContainer(ctr, restoreOptions)