From 3b0cdba4da55188b8ae49430f2730fade56b43ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:00 +0100 Subject: [PATCH 01/21] fs: record sequence number of origin mount namespace Store the sequence number of the mount namespace the anonymous mount namespace has been created from. This information will be used in follow-up patches. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-1-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 1 + fs/namespace.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index ffb613cdfeee..820a79f1f735 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -20,6 +20,7 @@ struct mnt_namespace { wait_queue_head_t poll; struct rcu_head mnt_ns_rcu; }; + u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; diff --git a/fs/namespace.c b/fs/namespace.c index a3ed3f2980cb..d5c85b6ecb98 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2853,15 +2853,30 @@ static int do_loopback(struct path *path, const char *old_name, static struct file *open_detached_copy(struct path *path, bool recursive) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; + struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; struct file *file; + ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); + + /* + * Record the sequence number of the source mount namespace. + * This needs to hold namespace_sem to ensure that the mount + * doesn't get attached. + */ + if (is_mounted(path->mnt)) { + src_mnt_ns = real_mount(path->mnt)->mnt_ns; + if (is_anon_ns(src_mnt_ns)) + ns->seq_origin = src_mnt_ns->seq_origin; + else + ns->seq_origin = src_mnt_ns->seq; + } + mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); From 2f576220cd8129a303ef4bac4b08e3937c42eff8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:01 +0100 Subject: [PATCH 02/21] fs: add mnt_ns_empty() helper Add a helper that checks whether a give mount namespace is empty instead of open-coding the specific data structure check. This also be will be used in follow-up patches. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-2-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 5 +++++ fs/namespace.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 820a79f1f735..e2501a724688 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -157,6 +157,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt) return !RB_EMPTY_NODE(&mnt->mnt_node); } +static inline bool mnt_ns_empty(const struct mnt_namespace *ns) +{ + return RB_EMPTY_ROOT(&ns->mounts); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { struct mnt_namespace *ns = mnt->mnt_ns; diff --git a/fs/namespace.c b/fs/namespace.c index d5c85b6ecb98..8ca28737f4ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5315,7 +5315,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ - if (RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; @@ -5340,7 +5340,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, int err; /* Has the namespace already been emptied? */ - if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); From 043bc81efbfc5580ecd9da76118722649205f30a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:02 +0100 Subject: [PATCH 03/21] fs: add assert for move_mount() After we've attached a detached mount tree the anonymous mount namespace must be empty. Add an assert and make this assumption explicit. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-3-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8ca28737f4ee..c9670a198b50 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3423,10 +3423,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path, out: unlock_mount(mp); if (!err) { - if (attached) + if (attached) { mntput_no_expire(parent); - else + } else { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); + } } return err; } From b73ec10a458796bed6d51768d5f26c4660f62ae4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:03 +0100 Subject: [PATCH 04/21] fs: add fastpath for dissolve_on_fput() Instead of acquiring the namespace semaphore and the mount lock everytime we close a file with FMODE_NEED_UNMOUNT set add a fastpath that checks whether we need to at all. Most of the time the caller will have attached the mount to the filesystem hierarchy and there's nothing to do. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-4-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 59 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index c9670a198b50..8cb15539aaaf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2246,22 +2246,57 @@ struct vfsmount *collect_mounts(const struct path *path) static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +{ + /* + * This mount belonged to an anonymous mount namespace + * but was moved to a non-anonymous mount namespace and + * then unmounted. + */ + if (unlikely(!mnt_ns)) + return false; + + /* + * This mount belongs to a non-anonymous mount namespace + * and we know that such a mount can never transition to + * an anonymous mount namespace again. + */ + if (!is_anon_ns(mnt_ns)) { + /* + * A detached mount either belongs to an anonymous mount + * namespace or a non-anonymous mount namespace. It + * should never belong to something purely internal. + */ + VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); + return false; + } + + return true; +} + void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; - namespace_lock(); - lock_mount_hash(); - ns = real_mount(mnt)->mnt_ns; - if (ns) { - if (is_anon_ns(ns)) - umount_tree(real_mount(mnt), UMOUNT_CONNECTED); - else - ns = NULL; + struct mount *m = real_mount(mnt); + + scoped_guard(rcu) { + if (!must_dissolve(READ_ONCE(m->mnt_ns))) + return; } - unlock_mount_hash(); - namespace_unlock(); - if (ns) - free_mnt_ns(ns); + + scoped_guard(rwsem_write, &namespace_sem) { + ns = m->mnt_ns; + if (!must_dissolve(ns)) + return; + + lock_mount_hash(); + umount_tree(m, UMOUNT_CONNECTED); + unlock_mount_hash(); + } + + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) From 9ed72af4e292e770f6977a032de32112b079c2ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:04 +0100 Subject: [PATCH 05/21] fs: add may_copy_tree() Add a helper that verifies whether a caller may copy a given mount tree. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-5-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 8cb15539aaaf..631404fa623c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2808,6 +2808,41 @@ static int do_change_type(struct path *path, int ms_flags) return err; } +/* may_copy_tree() - check if a mount tree can be copied + * @path: path to the mount tree to be copied + * + * This helper checks if the caller may copy the mount tree starting + * from @path->mnt. The caller may copy the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller tries to copy an nfs mount referring to a mount + * namespace, i.e., the caller is trying to copy a mount namespace + * entry from nsfs. + * (3) The caller tries to copy a pidfs mount referring to a pidfd. + * + * Returns true if the mount tree can be copied, false otherwise. + */ +static inline bool may_copy_tree(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + const struct dentry_operations *d_op; + + if (check_mnt(mnt)) + return true; + + d_op = path->dentry->d_op; + if (d_op == &ns_dentry_operations) + return true; + + if (d_op == &pidfs_dentry_operations) + return true; + + return false; +} + static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2815,13 +2850,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old)) { - const struct dentry_operations *d_op = old_path->dentry->d_op; - - if (d_op != &ns_dentry_operations && - d_op != &pidfs_dentry_operations) - return mnt; - } + if (!may_copy_tree(old_path)) + return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; From c5c12f871a302f6fae60f9f2840dad18adf049ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:05 +0100 Subject: [PATCH 06/21] fs: create detached mounts from detached mounts Add the ability to create detached mounts from detached mounts. Currently, detached mounts can only be created from attached mounts. This limitaton prevents various use-cases. For example, the ability to mount a subdirectory without ever having to make the whole filesystem visible first. The current permission model for the OPEN_TREE_CLONE flag of the open_tree() system call is: (1) Check that the caller is privileged over the owning user namespace of it's current mount namespace. (2) Check that the caller is located in the mount namespace of the mount it wants to create a detached copy of. While it is not strictly necessary to do it this way it is consistently applied in the new mount api. This model will also be used when allowing the creation of detached mount from another detached mount. The (1) requirement can simply be met by performing the same check as for the non-detached case, i.e., verify that the caller is privileged over its current mount namespace. To meet the (2) requirement it must be possible to infer the origin mount namespace that the anonymous mount namespace of the detached mount was created from. The origin mount namespace of an anonymous mount is the mount namespace that the mounts that were copied into the anonymous mount namespace originate from. The origin mount namespace of the anonymous mount namespace must be the same as the caller's mount namespace. To establish this the sequence number of the caller's mount namespace and the origin sequence number of the anonymous mount namespace are compared. The caller is always located in a non-anonymous mount namespace since anonymous mount namespaces cannot be setns()ed into. The caller's mount namespace will thus always have a valid sequence number. The owning namespace of any mount namespace, anonymous or non-anonymous, can never change. A mount attached to a non-anonymous mount namespace can never change mount namespace. If the sequence number of the non-anonymous mount namespace and the origin sequence number of the anonymous mount namespace match, the owning namespaces must match as well. Hence, the capability check on the owning namespace of the caller's mount namespace ensures that the caller has the ability to copy the mount tree. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-6-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 631404fa623c..36f5102ef3d1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -998,6 +998,12 @@ static inline int check_mnt(struct mount *mnt) return mnt->mnt_ns == current->nsproxy->mnt_ns; } +static inline bool check_anonymous_mnt(struct mount *mnt) +{ + return is_anon_ns(mnt->mnt_ns) && + mnt->mnt_ns->seq_origin == current->nsproxy->mnt_ns->seq; +} + /* * vfsmount lock must be held for write */ @@ -2822,6 +2828,32 @@ static int do_change_type(struct path *path, int ms_flags) * namespace, i.e., the caller is trying to copy a mount namespace * entry from nsfs. * (3) The caller tries to copy a pidfs mount referring to a pidfd. + * (4) The caller is trying to copy a mount tree that belongs to an + * anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * This is not strictly necessary. The current semantics of the new + * mount api enforce that the caller must be located in the same + * mount namespace as the mount tree it interacts with. Using the + * origin sequence number preserves these semantics even for + * anonymous mount namespaces. However, one could envision extending + * the api to directly operate across mount namespace if needed. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to copy the mount tree. * * Returns true if the mount tree can be copied, false otherwise. */ @@ -2840,9 +2872,13 @@ static inline bool may_copy_tree(struct path *path) if (d_op == &pidfs_dentry_operations) return true; - return false; + if (!is_mounted(path->mnt)) + return false; + + return check_anonymous_mnt(mnt); } + static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); From 3f1724dd567e5a5beb12aab19c08d364a20ed89c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:06 +0100 Subject: [PATCH 07/21] selftests: create detached mounts from detached mounts Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-7-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 70f65eb320a7..885d98f2d889 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -126,6 +126,26 @@ #endif #endif +#ifndef __NR_move_mount + #if defined __alpha__ + #define __NR_move_mount 539 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_move_mount 4429 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_move_mount 6429 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_move_mount 5429 + #endif + #elif defined __ia64__ + #define __NR_move_mount (428 + 1024) + #else + #define __NR_move_mount 429 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -157,6 +177,51 @@ static inline int sys_open_tree(int dfd, const char *filename, unsigned int flag return syscall(__NR_open_tree, dfd, filename, flags); } +/* move_mount() flags */ +#ifndef MOVE_MOUNT_F_SYMLINKS +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ +#endif + +#ifndef MOVE_MOUNT_F_AUTOMOUNTS +#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ +#endif + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +#ifndef MOVE_MOUNT_T_SYMLINKS +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ +#endif + +#ifndef MOVE_MOUNT_T_AUTOMOUNTS +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ +#endif + +#ifndef MOVE_MOUNT_T_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#endif + +#ifndef MOVE_MOUNT_SET_GROUP +#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ +#endif + +#ifndef MOVE_MOUNT_BENEATH +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#endif + +#ifndef MOVE_MOUNT__MASK +#define MOVE_MOUNT__MASK 0x00000377 +#endif + +static inline int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, + to_pathname, flags); +} + static ssize_t write_nointr(int fd, const void *buf, size_t count) { ssize_t ret; @@ -397,6 +462,10 @@ FIXTURE_SETUP(mount_setattr) ASSERT_EQ(mkdir("/tmp/B/BB", 0777), 0); + ASSERT_EQ(mkdir("/tmp/target1", 0777), 0); + + ASSERT_EQ(mkdir("/tmp/target2", 0777), 0); + ASSERT_EQ(mount("testing", "/tmp/B/BB", "tmpfs", MS_NOATIME | MS_NODEV, "size=100000,mode=700"), 0); @@ -1506,4 +1575,218 @@ TEST_F(mount_setattr, mount_attr_nosymfollow) ASSERT_EQ(close(fd), 0); } +TEST_F(mount_setattr, open_tree_detached) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /AA testing tmpfs + * `-/AA/B testing tmpfs + * `-/AA/B/BB testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_subdir, "B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_subdir, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target1 testing tmpfs + * `-/tmp/target1/B testing tmpfs + * `-/tmp/target1/B/BB testing tmpfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target1/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target1/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target2", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target2 testing tmpfs + * |-/tmp/target2/A testing tmpfs + * | `-/tmp/target2/A/AA testing tmpfs + * | `-/tmp/target2/A/AA/B testing tmpfs + * | `-/tmp/target2/A/AA/B/BB testing tmpfs + * `-/tmp/target2/B testing ramfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target2", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, open_tree_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + /* + * The origin mount namespace of the anonymous mount namespace + * of @fd_tree_base doesn't match the caller's mount namespace + * anymore so creation of another detached mounts must fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + +TEST_F(mount_setattr, open_tree_detached_fail2) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(create_and_enter_userns(), 0); + + /* + * The caller entered a new user namespace. They will have + * CAP_SYS_ADMIN in this user namespace. However, they're still + * located in a mount namespace that is owned by an ancestor + * user namespace in which they hold no privilege. Creating a + * detached mount must thus fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EPERM); +} + +TEST_F(mount_setattr, open_tree_detached_fail3) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(prepare_unpriv_mountns(), 0); + + /* + * The caller entered a new mount namespace. They will have + * CAP_SYS_ADMIN in the owning user namespace of their mount + * namespace. + * + * However, the origin mount namespace of the anonymous mount + * namespace of @fd_tree_base doesn't match the caller's mount + * namespace anymore so creation of another detached mounts must + * fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + TEST_HARNESS_MAIN From f9fde814de3755d5d3818fe51244c45699f7252e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:07 +0100 Subject: [PATCH 08/21] fs: support getname_maybe_null() in move_mount() Allow move_mount() to work with NULL path arguments. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-8-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 93 ++++++++++++++++++++++++++++++---------------- include/linux/fs.h | 1 + 2 files changed, 61 insertions(+), 33 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 36f5102ef3d1..265876800d5f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2465,6 +2465,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) enum mnt_tree_flags_t { MNT_TREE_MOVE = BIT(0), MNT_TREE_BENEATH = BIT(1), + MNT_TREE_PROPAGATION = BIT(2), }; /** @@ -3436,8 +3437,8 @@ static int can_move_mount_beneath(const struct path *from, return 0; } -static int do_move_mount(struct path *old_path, struct path *new_path, - bool beneath) +static int do_move_mount(struct path *old_path, + struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; @@ -3445,8 +3446,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, struct mount *parent; struct mountpoint *mp, *old_mp; int err; - bool attached; - enum mnt_tree_flags_t flags = 0; + bool attached, beneath = flags & MNT_TREE_BENEATH; mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) @@ -3547,7 +3547,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, false); + err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } @@ -4388,6 +4388,21 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, return ret; } +static inline int vfs_move_mount(struct path *from_path, struct path *to_path, + enum mnt_tree_flags_t mflags) +{ + int ret; + + ret = security_move_mount(from_path, to_path); + if (ret) + return ret; + + if (mflags & MNT_TREE_PROPAGATION) + return do_set_group(from_path, to_path); + + return do_move_mount(from_path, to_path, mflags); +} + /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination @@ -4401,8 +4416,12 @@ SYSCALL_DEFINE5(move_mount, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { - struct path from_path, to_path; - unsigned int lflags; + struct path to_path __free(path_put) = {}; + struct path from_path __free(path_put) = {}; + struct filename *to_name __free(putname) = NULL; + struct filename *from_name __free(putname) = NULL; + unsigned int lflags, uflags; + enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) @@ -4415,43 +4434,51 @@ SYSCALL_DEFINE5(move_mount, (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; - /* If someone gives a pathname, they aren't permitted to move - * from an fd that requires unmount as we can't get at the flag - * to clear it afterwards. - */ + if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; + if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; + lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; - - ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); - if (ret < 0) - return ret; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; + to_name = getname_maybe_null(to_pathname, uflags); + if (IS_ERR(to_name)) + return PTR_ERR(to_name); - ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); - if (ret < 0) - goto out_from; + if (!to_name && to_dfd >= 0) { + CLASS(fd_raw, f_to)(to_dfd); + if (fd_empty(f_to)) + return -EBADF; - ret = security_move_mount(&from_path, &to_path); - if (ret < 0) - goto out_to; + to_path = fd_file(f_to)->f_path; + path_get(&to_path); + } else { + ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); + if (ret) + return ret; + } - if (flags & MOVE_MOUNT_SET_GROUP) - ret = do_set_group(&from_path, &to_path); - else - ret = do_move_mount(&from_path, &to_path, - (flags & MOVE_MOUNT_BENEATH)); + if (!from_name && from_dfd >= 0) { + CLASS(fd_raw, f_from)(from_dfd); + if (fd_empty(f_from)) + return -EBADF; -out_to: - path_put(&to_path); -out_from: - path_put(&from_path); - return ret; + return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); + } + + ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); + if (ret) + return ret; + + return vfs_move_mount(&from_path, &to_path, mflags); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e71d58c7f59c..7e9df867538d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2855,6 +2855,7 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f return __getname_maybe_null(name); } extern void putname(struct filename *name); +DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); From 21107723831e96f8652c587961e33e3f75b72fc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:08 +0100 Subject: [PATCH 09/21] fs: mount detached mounts onto detached mounts Currently, detached mounts can only be mounted onto attached mounts. This limitation makes it impossible to assemble a new private rootfs and move it into place. That's an extremely powerful concept for container and service workloads that we should support. Right now, a detached tree must be created, attached, then it can gain additional mounts and then it can either be moved (if it doesn't reside under a shared mount) or a detached mount created again. Lift this restriction. In order to allow mounting detached mounts onto other detached mounts the same permission model used for creating detached mounts from detached mounts can be used: (1) Check that the caller is privileged over the owning user namespace of it's current mount namespace. (2) Check that the caller is located in the mount namespace of the mount it wants to create a detached copy of. The origin mount namespace of the anonymous mount namespace must be the same as the caller's mount namespace. To establish this the sequence number of the caller's mount namespace and the origin sequence number of the anonymous mount namespace are compared. The caller is always located in a non-anonymous mount namespace since anonymous mount namespaces cannot be setns()ed into. The caller's mount namespace will thus always have a valid sequence number. The owning namespace of any mount namespace, anonymous or non-anonymous, can never change. A mount attached to a non-anonymous mount namespace can never change mount namespace. If the sequence number of the non-anonymous mount namespace and the origin sequence number of the anonymous mount namespace match, the owning namespaces must match as well. Hence, the capability check on the owning namespace of the caller's mount namespace ensures that the caller has the ability to attach the mount tree. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-9-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 265876800d5f..7f7ae1b2ce3a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2295,6 +2295,24 @@ void dissolve_on_fput(struct vfsmount *mnt) if (!must_dissolve(ns)) return; + /* + * After must_dissolve() we know that this is a detached + * mount in an anonymous mount namespace. + * + * Now when mnt_has_parent() reports that this mount + * tree has a parent, we know that this anonymous mount + * tree has been moved to another anonymous mount + * namespace. + * + * So when closing this file we cannot unmount the mount + * tree. This will be done when the file referring to + * the root of the anonymous mount namespace will be + * closed (It could already be closed but it would sync + * on @namespace_sem and wait for us to finish.). + */ + if (mnt_has_parent(m)) + return; + lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); @@ -3437,6 +3455,54 @@ static int can_move_mount_beneath(const struct path *from, return 0; } +/* may_use_mount() - check if a mount tree can be used + * @mnt: vfsmount to be used + * + * This helper checks if the caller may use the mount tree starting + * from @path->mnt. The caller may use the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller is trying to use a mount tree that belongs to an + * anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to use the mount tree. + * + * Returns true if the mount tree can be used, false otherwise. + */ +static inline bool may_use_mount(struct mount *mnt) +{ + if (check_mnt(mnt)) + return true; + + /* + * Make sure that noone unmounted the target path or somehow + * managed to get their hands on something purely kernel + * internal. + */ + if (!is_mounted(&mnt->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + static int do_move_mount(struct path *old_path, struct path *new_path, enum mnt_tree_flags_t flags) { @@ -3462,8 +3528,14 @@ static int do_move_mount(struct path *old_path, ns = old->mnt_ns; err = -EINVAL; - /* The mountpoint must be in our namespace. */ - if (!check_mnt(p)) + if (!may_use_mount(p)) + goto out; + + /* + * Don't allow moving an attached mount tree to an anonymous + * mount tree. + */ + if (!is_anon_ns(ns) && is_anon_ns(p->mnt_ns)) goto out; /* The thing moved must be mounted... */ @@ -3474,6 +3546,16 @@ static int do_move_mount(struct path *old_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; + /* + * Ending up with two files referring to the root of the same + * anonymous mount namespace would cause an error as this would + * mean trying to move the same mount twice into the mount tree + * which would be rejected later. But be explicit about it right + * here. + */ + if (is_anon_ns(ns) && is_anon_ns(p->mnt_ns) && ns == p->mnt_ns) + goto out; + if (old->mnt.mnt_flags & MNT_LOCKED) goto out; From d83a55b826e3f830f0772bd9d7f68b4ed63d2122 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:09 +0100 Subject: [PATCH 10/21] selftests: first test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-10-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 885d98f2d889..0641d2e22e02 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1789,4 +1789,44 @@ TEST_F(mount_setattr, open_tree_detached_fail3) ASSERT_EQ(errno, EINVAL); } +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_then_close) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + TEST_HARNESS_MAIN From 89ddfd4af1b246a893d5e85faf637f129d2bc1f5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:10 +0100 Subject: [PATCH 11/21] selftests: second test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-11-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 0641d2e22e02..082fdf19a9be 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1829,4 +1829,53 @@ TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_then_close) EXPECT_EQ(close(fd_tree_subdir), 0); } +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_and_attach) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + __u64 mnt_id = 0; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + mnt_id = stx.stx_mnt_id; + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, STATX_MNT_ID_UNIQUE, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + ASSERT_EQ(stx.stx_mnt_id, mnt_id); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + TEST_HARNESS_MAIN From 4fd0aea1ea7a64801caa5f0b149004f418270554 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:11 +0100 Subject: [PATCH 12/21] selftests: third test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-12-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 082fdf19a9be..66eed84c6a01 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1878,4 +1878,35 @@ TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_and_attach) EXPECT_EQ(close(fd_tree_subdir), 0); } +TEST_F(mount_setattr, move_mount_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(-EBADF, "/tmp/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + ASSERT_EQ(statx(fd_tree_subdir, "BB", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Not allowed to move an attached mount to a detached mount. */ + ASSERT_NE(move_mount(fd_tree_base, "", fd_tree_subdir, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(errno, EINVAL); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + TEST_HARNESS_MAIN From 1768f9bd53c97a47a3a8ece94ab5436833d71007 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:12 +0100 Subject: [PATCH 13/21] selftests: fourth test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-13-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 66eed84c6a01..0ee4e5b27c68 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1909,4 +1909,34 @@ TEST_F(mount_setattr, move_mount_detached_fail) EXPECT_EQ(close(fd_tree_subdir), 0); } +TEST_F(mount_setattr, attach_detached_mount_then_umount_then_close) +{ + int fd_tree = -EBADF; + struct statx stx; + + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx), 0); + /* We copied with AT_RECURSIVE so /mnt/A must be a mountpoint. */ + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(umount2("/tmp/target1", MNT_DETACH), 0); + + /* + * This tests whether dissolve_on_fput() handles a NULL mount + * namespace correctly, i.e., that it doesn't splat. + */ + EXPECT_EQ(close(fd_tree), 0); +} + TEST_HARNESS_MAIN From d346ae7a641a02d76a6d9ee9ce2b7602e9e3965f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:13 +0100 Subject: [PATCH 14/21] selftests: fifth test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-14-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 0ee4e5b27c68..938c9e33ac00 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -1939,4 +1939,82 @@ TEST_F(mount_setattr, attach_detached_mount_then_umount_then_close) EXPECT_EQ(close(fd_tree), 0); } +TEST_F(mount_setattr, mount_detached1_onto_detached2_then_close_detached1_then_mount_detached2_onto_attached) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * `-/mnt/B testing ramfs + */ + fd_tree2 = sys_open_tree(-EBADF, "/mnt/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree2, 0); + + /* + * Move the source detached mount tree to the target detached + * mount tree. This will move all the mounts in the source mount + * tree from the source anonymous mount namespace to the target + * anonymous mount namespace. + * + * The source detached mount tree and the target detached mount + * tree now both refer to the same anonymous mount namespace. + * + * |-"" testing ramfs + * `-"" testing tmpfs + * `-""/AA testing tmpfs + * `-""/AA/B testing tmpfs + * `-""/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree1, "", fd_tree2, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + /* + * The source detached mount tree @fd_tree1 is now an attached + * mount, i.e., it has a parent. Specifically, it now has the + * root mount of the mount tree of @fd_tree2 as its parent. + * + * That means we are no longer allowed to attach it as we only + * allow attaching the root of an anonymous mount tree, not + * random bits and pieces. Verify that the kernel enforces this. + */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * Closing the source detached mount tree must not unmount and + * free the shared anonymous mount namespace. The kernel will + * quickly yell at us because the anonymous mount namespace + * won't be empty when it's freed. + */ + EXPECT_EQ(close(fd_tree1), 0); + + /* + * Attach the mount tree to a non-anonymous mount namespace. + * This can only succeed if closing fd_tree1 had proper + * semantics and didn't cause the anonymous mount namespace to + * be freed. If it did this will trigger a UAF which will be + * visible on any KASAN enabled kernel. + * + * |-/tmp/target1 testing ramfs + * `-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + EXPECT_EQ(close(fd_tree2), 0); +} + TEST_HARNESS_MAIN From 0e6eef95dfc908f6156be01aefee23a553c02afd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:14 +0100 Subject: [PATCH 15/21] selftests: sixth test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-15-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 938c9e33ac00..d2bf8b77df3f 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -2017,4 +2017,48 @@ TEST_F(mount_setattr, mount_detached1_onto_detached2_then_close_detached1_then_m EXPECT_EQ(close(fd_tree2), 0); } +TEST_F(mount_setattr, two_detached_mounts_referring_to_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file + * that refers to the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * Copy the following mount tree: + * + * |-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * This must fail as this would mean adding the same mount tree + * into the same mount tree. + */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + TEST_HARNESS_MAIN From 7bece690a990177ee4dda9640b4d7236e360d826 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:15 +0100 Subject: [PATCH 16/21] selftests: seventh test for mounting detached mounts onto detached mounts Add a test to verify that detached mounts behave correctly. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-16-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index d2bf8b77df3f..1e0508cb5c2d 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -2061,4 +2061,40 @@ TEST_F(mount_setattr, two_detached_mounts_referring_to_same_anonymous_mount_name ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); } +TEST_F(mount_setattr, two_detached_subtrees_of_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file that + * refers to a subtree of the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * This must fail as it is only possible to attach the root of a + * detached mount tree. + */ + ASSERT_NE(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + TEST_HARNESS_MAIN From f8b6cd66e479b67a506303954373494a4bbbda2c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 25 Feb 2025 10:31:34 +0100 Subject: [PATCH 17/21] fs: allow creating detached mounts from fsmount() file descriptors The previous patch series only enabled the creation of detached mounts from detached mounts that were created via open_tree(). In such cases we know that the origin sequence number for the newly created anonymous mount namespace will be set to the sequence number of the mount namespace the source mount belonged to. But fsmount() creates an anonymous mount namespace that does not have an origin mount namespace as the anonymous mount namespace was derived from a filesystem context created via fsopen(). Account for this case and allow the creation of detached mounts from mounts created via fsmount(). Consequently, any such detached mount created from an fsmount() mount will also have a zero origin sequence number. This allows to mount subdirectories without ever having to expose the filesystem to a a non-anonymous mount namespace: fd_context = sys_fsopen("tmpfs", 0); sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fd_tmpfs = sys_fsmount(fd_context, 0, 0); mkdirat(fd_tmpfs, "subdir", 0755); fd_tree = sys_open_tree(fd_tmpfs, "subdir", OPEN_TREE_CLONE); sys_move_mount(fd_tree, "", -EBADF, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); Signed-off-by: Christian Brauner --- fs/namespace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7f7ae1b2ce3a..2200225a455a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1000,8 +1000,13 @@ static inline int check_mnt(struct mount *mnt) static inline bool check_anonymous_mnt(struct mount *mnt) { - return is_anon_ns(mnt->mnt_ns) && - mnt->mnt_ns->seq_origin == current->nsproxy->mnt_ns->seq; + u64 seq; + + if (!is_anon_ns(mnt->mnt_ns)) + return false; + + seq = mnt->mnt_ns->seq_origin; + return !seq || (seq == current->nsproxy->mnt_ns->seq); } /* From 064fe6e233e8d54d1b26bfabc38ed96d798c7416 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 25 Feb 2025 11:15:46 +0100 Subject: [PATCH 18/21] mount: handle mount propagation for detached mount trees In commit ee2e3f50629f ("mount: fix mounting of detached mounts onto targets that reside on shared mounts") I fixed a bug where propagating the source mount tree of an anonymous mount namespace into a target mount tree of a non-anonymous mount namespace could be used to trigger an integer overflow in the non-anonymous mount namespace causing any new mounts to fail. The cause of this was that the propagation algorithm was unable to recognize mounts from the source mount tree that were already propagated into the target mount tree and then reappeared as propagation targets when walking the destination propagation mount tree. When fixing this I disabled mount propagation into anonymous mount namespaces. Make it possible for anonymous mount namespace to receive mount propagation events correctly. This is no also a correctness issue now that we allow mounting detached mount trees onto detached mount trees. Mark the source anonymous mount namespace with MNTNS_PROPAGATING indicating that all mounts belonging to this mount namespace are currently in the process of being propagated and make the propagation algorithm discard those if they appear as propagation targets. Link: https://lore.kernel.org/r/20250225-work-mount-propagation-v1-1-e6e3724500eb@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 7 +++++++ fs/namespace.c | 42 +++++++++++++++++++++++++++--------------- fs/pnode.c | 10 +++++----- fs/pnode.h | 2 +- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index e2501a724688..96862eba2246 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,12 @@ #include #include +extern struct list_head notify_list; + +typedef __u32 __bitwise mntns_flags_t; + +#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) + struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -27,6 +33,7 @@ struct mnt_namespace { struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ + mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index a7da41e81673..59894e3f3e68 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3536,13 +3536,6 @@ static int do_move_mount(struct path *old_path, if (!may_use_mount(p)) goto out; - /* - * Don't allow moving an attached mount tree to an anonymous - * mount tree. - */ - if (!is_anon_ns(ns) && is_anon_ns(p->mnt_ns)) - goto out; - /* The thing moved must be mounted... */ if (!is_mounted(&old->mnt)) goto out; @@ -3551,15 +3544,31 @@ static int do_move_mount(struct path *old_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; - /* - * Ending up with two files referring to the root of the same - * anonymous mount namespace would cause an error as this would - * mean trying to move the same mount twice into the mount tree - * which would be rejected later. But be explicit about it right - * here. - */ - if (is_anon_ns(ns) && is_anon_ns(p->mnt_ns) && ns == p->mnt_ns) + if (is_anon_ns(ns)) { + /* + * Ending up with two files referring to the root of the + * same anonymous mount namespace would cause an error + * as this would mean trying to move the same mount + * twice into the mount tree which would be rejected + * later. But be explicit about it right here. + */ + if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) + goto out; + + /* + * If this is an anonymous mount tree ensure that mount + * propagation can detect mounts that were just + * propagated to the target mount tree so we don't + * propagate onto them. + */ + ns->mntns_flags |= MNTNS_PROPAGATING; + } else if (is_anon_ns(p->mnt_ns)) { + /* + * Don't allow moving an attached mount tree to an + * anonymous mount tree. + */ goto out; + } if (old->mnt.mnt_flags & MNT_LOCKED) goto out; @@ -3603,6 +3612,9 @@ static int do_move_mount(struct path *old_path, if (err) goto out; + if (is_anon_ns(ns)) + ns->mntns_flags &= ~MNTNS_PROPAGATING; + /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); diff --git a/fs/pnode.c b/fs/pnode.c index ef048f008bdd..6dd26ca96cdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { @@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m, * Advance m such that propagation_next will not return * the slaves of m. */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; @@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) while (1) { while (1) { struct mount *next; - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { @@ -226,7 +226,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) struct mount *child; int type; /* skip ones added by this propagate_mnt() */ - if (IS_MNT_NEW(m)) + if (IS_MNT_PROPAGATED(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) @@ -380,7 +380,7 @@ bool propagation_would_overmount(const struct mount *from, if (!IS_MNT_SHARED(from)) return false; - if (IS_MNT_NEW(to)) + if (IS_MNT_PROPAGATED(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) diff --git a/fs/pnode.h b/fs/pnode.h index 0b02a6393891..ddafe0d087ca 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@ #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns)) +#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING)) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) From 99b6a1dee086e69d242c268a547714cae71d9e59 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 09:11:54 +0100 Subject: [PATCH 19/21] fs: namespace: fix uninitialized variable use clang correctly notices that the 'uflags' variable initialization only happens in some cases: fs/namespace.c:4622:6: error: variable 'uflags' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] 4622 | if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/namespace.c:4623:48: note: uninitialized use occurs here 4623 | from_name = getname_maybe_null(from_pathname, uflags); | ^~~~~~ fs/namespace.c:4622:2: note: remove the 'if' if its condition is always true 4622 | if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: b1e9423d65e3 ("fs: support getname_maybe_null() in move_mount()") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250226081201.1876195-1-arnd@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 2200225a455a..a7da41e81673 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4527,6 +4527,7 @@ SYSCALL_DEFINE5(move_mount, lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + uflags = 0; if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; from_name = getname_maybe_null(from_pathname, uflags); if (IS_ERR(from_name)) @@ -4535,6 +4536,7 @@ SYSCALL_DEFINE5(move_mount, lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + uflags = 0; if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; to_name = getname_maybe_null(to_pathname, uflags); if (IS_ERR(to_name)) From 12e40cbb0ee5951359b759655cef832d72f9397f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 25 Feb 2025 11:15:47 +0100 Subject: [PATCH 20/21] selftests: add test for detached mount tree propagation Test that detached mount trees receive propagation events. Link: https://lore.kernel.org/r/20250225-work-mount-propagation-v1-2-e6e3724500eb@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 1e0508cb5c2d..33fa1f53fdf5 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -2097,4 +2097,74 @@ TEST_F(mount_setattr, two_detached_subtrees_of_same_anonymous_mount_namespace) ASSERT_EQ(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); } +TEST_F(mount_setattr, detached_tree_propagation) +{ + int fd_tree = -EBADF; + struct statx stx1, stx2, stx3, stx4; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount(NULL, "/mnt", NULL, MS_REC | MS_SHARED, NULL), 0); + + /* + * Copy the following mount tree: + * + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx1), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx2), 0); + + /* + * Copying the mount namespace like done above doesn't alter the + * mounts in any way so the filesystem mounted on /mnt must be + * identical even though the mounts will differ. Use the device + * information to verify that. Note that tmpfs will have a 0 + * major number so comparing the major number is misleading. + */ + ASSERT_EQ(stx1.stx_dev_minor, stx2.stx_dev_minor); + + /* Mount a tmpfs filesystem over /mnt/A. */ + ASSERT_EQ(mount(NULL, "/mnt/A", "tmpfs", 0, NULL), 0); + + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx3), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx4), 0); + + /* + * A new filesystem has been mounted on top of /mnt/A which + * means that the device information will be different for any + * statx() that was taken from /mnt/A before the mount compared + * to one after the mount. + * + * Since we already now that the device information between the + * stx1 and stx2 samples are identical we also now that stx2 and + * stx3 device information will necessarily differ. + */ + ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor); + + /* + * If mount propagation worked correctly then the tmpfs mount + * that was created after the mount namespace was unshared will + * have propagated onto /mnt/A in the detached mount tree. + * + * Verify that the device information for stx3 and stx4 are + * identical. It is already established that stx3 is different + * from both stx1 and stx2 sampled before the tmpfs mount was + * done so if stx3 and stx4 are identical the proof is done. + */ + ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor); + + EXPECT_EQ(close(fd_tree), 0); +} + TEST_HARNESS_MAIN From e5c10b19b3a048512f91758d0f615597fbe68399 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 25 Feb 2025 11:15:48 +0100 Subject: [PATCH 21/21] selftests: test subdirectory mounting This tests mounting a subdirectory without ever having to expose the filesystem to a non-anonymous mount namespace. Link: https://lore.kernel.org/r/20250225-work-mount-propagation-v1-3-e6e3724500eb@kernel.org Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 81 +++++++++---------- 1 file changed, 36 insertions(+), 45 deletions(-) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 33fa1f53fdf5..48a000cabc97 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -20,6 +20,7 @@ #include #include +#include "../filesystems/overlayfs/wrappers.h" #include "../kselftest_harness.h" #ifndef CLONE_NEWNS @@ -177,51 +178,6 @@ static inline int sys_open_tree(int dfd, const char *filename, unsigned int flag return syscall(__NR_open_tree, dfd, filename, flags); } -/* move_mount() flags */ -#ifndef MOVE_MOUNT_F_SYMLINKS -#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */ -#endif - -#ifndef MOVE_MOUNT_F_AUTOMOUNTS -#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */ -#endif - -#ifndef MOVE_MOUNT_F_EMPTY_PATH -#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ -#endif - -#ifndef MOVE_MOUNT_T_SYMLINKS -#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */ -#endif - -#ifndef MOVE_MOUNT_T_AUTOMOUNTS -#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ -#endif - -#ifndef MOVE_MOUNT_T_EMPTY_PATH -#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#endif - -#ifndef MOVE_MOUNT_SET_GROUP -#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#endif - -#ifndef MOVE_MOUNT_BENEATH -#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ -#endif - -#ifndef MOVE_MOUNT__MASK -#define MOVE_MOUNT__MASK 0x00000377 -#endif - -static inline int sys_move_mount(int from_dfd, const char *from_pathname, - int to_dfd, const char *to_pathname, - unsigned int flags) -{ - return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, - to_pathname, flags); -} - static ssize_t write_nointr(int fd, const void *buf, size_t count) { ssize_t ret; @@ -1789,6 +1745,41 @@ TEST_F(mount_setattr, open_tree_detached_fail3) ASSERT_EQ(errno, EINVAL); } +TEST_F(mount_setattr, open_tree_subfolder) +{ + int fd_context, fd_tmpfs, fd_tree; + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + + EXPECT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "subdir", 0755), 0); + + fd_tree = sys_open_tree(fd_tmpfs, "subdir", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + EXPECT_EQ(close(fd_tmpfs), 0); + + ASSERT_EQ(mkdirat(-EBADF, "/mnt/open_tree_subfolder", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tree, "", -EBADF, "/mnt/open_tree_subfolder", MOVE_MOUNT_F_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree), 0); + + ASSERT_EQ(umount2("/mnt/open_tree_subfolder", 0), 0); + + EXPECT_EQ(rmdir("/mnt/open_tree_subfolder"), 0); +} + TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_then_close) { int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF;