From 1e6ed33cabba8f06f532f2e5851a102602823734 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 7 Jun 2025 13:43:32 +0100 Subject: [PATCH 1/7] btrfs: fix failure to rebuild free space tree using multiple transactions If we are rebuilding a free space tree, while modifying the free space tree we may need to allocate a new metadata block group. If we end up using multiple transactions for the rebuild, when we call btrfs_end_transaction() we enter btrfs_create_pending_block_groups() which calls add_block_group_free_space() to add items to the free space tree for the block group. Then later during the free space tree rebuild, at btrfs_rebuild_free_space_tree(), we may find such new block groups and call populate_free_space_tree() for them, which fails with -EEXIST because there are already items in the free space tree. Then we abort the transaction with -EEXIST at btrfs_rebuild_free_space_tree(). Notice that we say "may find" the new block groups because a new block group may be inserted in the block groups rbtree, which is being iterated by the rebuild process, before or after the current node where the rebuild process is currently at. Syzbot recently reported such case which produces a trace like the following: ------------[ cut here ]------------ BTRFS: Transaction aborted (error -17) WARNING: CPU: 1 PID: 7626 at fs/btrfs/free-space-tree.c:1341 btrfs_rebuild_free_space_tree+0x470/0x54c fs/btrfs/free-space-tree.c:1341 Modules linked in: CPU: 1 UID: 0 PID: 7626 Comm: syz.2.25 Not tainted 6.15.0-rc7-syzkaller-00085-gd7fa1af5b33e-dirty #0 PREEMPT Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : btrfs_rebuild_free_space_tree+0x470/0x54c fs/btrfs/free-space-tree.c:1341 lr : btrfs_rebuild_free_space_tree+0x470/0x54c fs/btrfs/free-space-tree.c:1341 sp : ffff80009c4f7740 x29: ffff80009c4f77b0 x28: ffff0000d4c3f400 x27: 0000000000000000 x26: dfff800000000000 x25: ffff70001389eee8 x24: 0000000000000003 x23: 1fffe000182b6e7b x22: 0000000000000000 x21: ffff0000c15b73d8 x20: 00000000ffffffef x19: ffff0000c15b7378 x18: 1fffe0003386f276 x17: ffff80008f31e000 x16: ffff80008adbe98c x15: 0000000000000001 x14: 1fffe0001b281550 x13: 0000000000000000 x12: 0000000000000000 x11: ffff60001b281551 x10: 0000000000000003 x9 : 1c8922000a902c00 x8 : 1c8922000a902c00 x7 : ffff800080485878 x6 : 0000000000000000 x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff80008047843c x2 : 0000000000000001 x1 : ffff80008b3ebc40 x0 : 0000000000000001 Call trace: btrfs_rebuild_free_space_tree+0x470/0x54c fs/btrfs/free-space-tree.c:1341 (P) btrfs_start_pre_rw_mount+0xa78/0xe10 fs/btrfs/disk-io.c:3074 btrfs_remount_rw fs/btrfs/super.c:1319 [inline] btrfs_reconfigure+0x828/0x2418 fs/btrfs/super.c:1543 reconfigure_super+0x1d4/0x6f0 fs/super.c:1083 do_remount fs/namespace.c:3365 [inline] path_mount+0xb34/0xde0 fs/namespace.c:4200 do_mount fs/namespace.c:4221 [inline] __do_sys_mount fs/namespace.c:4432 [inline] __se_sys_mount fs/namespace.c:4409 [inline] __arm64_sys_mount+0x3e8/0x468 fs/namespace.c:4409 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 irq event stamp: 330 hardirqs last enabled at (329): [] raw_spin_rq_unlock_irq kernel/sched/sched.h:1525 [inline] hardirqs last enabled at (329): [] finish_lock_switch+0xb0/0x1c0 kernel/sched/core.c:5130 hardirqs last disabled at (330): [] el1_dbg+0x24/0x80 arch/arm64/kernel/entry-common.c:511 softirqs last enabled at (10): [] local_bh_enable+0x10/0x34 include/linux/bottom_half.h:32 softirqs last disabled at (8): [] local_bh_disable+0x10/0x34 include/linux/bottom_half.h:19 ---[ end trace 0000000000000000 ]--- Fix this by flagging new block groups which had their free space tree entries already added and then skip them in the rebuild process. Also, since the rebuild may be triggered when doing a remount, make sure that when we clear an existing free space tree that we clear such flag from every existing block group, otherwise we would skip those block groups during the rebuild. Reported-by: syzbot+d0014fb0fc39c5487ae5@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/68460a54.050a0220.daf97.0af5.GAE@google.com/ Fixes: 882af9f13e83 ("btrfs: handle free space tree rebuild in multiple transactions") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 2 ++ fs/btrfs/free-space-tree.c | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9de356bcb411..aa176cc9a324 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -83,6 +83,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Set after we add a new block group to the free space tree. */ + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a3e2a2a81461..a83c268f7f87 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1241,6 +1241,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; + struct rb_node *node; int nr; int ret; @@ -1269,6 +1270,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); } + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *bg; + + bg = rb_entry(node, struct btrfs_block_group, cache_node); + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); + node = rb_next(node); + cond_resched(); + } + return 0; } @@ -1358,12 +1369,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); + + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, + &block_group->runtime_flags)) + goto next; + ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } +next: if (btrfs_should_end_transaction(trans)) { btrfs_end_transaction(trans); trans = btrfs_start_transaction(free_space_root, 1); @@ -1390,6 +1407,29 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + /* + * While rebuilding the free space tree we may allocate new metadata + * block groups while modifying the free space tree. + * + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we + * can use multiple transactions, every time btrfs_end_transaction() is + * called at btrfs_rebuild_free_space_tree() we finish the creation of + * new block groups by calling btrfs_create_pending_block_groups(), and + * that in turn calls us, through add_block_group_free_space(), to add + * a free space info item and a free space extent item for the block + * group. + * + * Then later btrfs_rebuild_free_space_tree() may find such new block + * groups and processes them with populate_free_space_tree(), which can + * fail with EEXIST since there are already items for the block group in + * the free space tree. Notice that we say "may find" because a new + * block group may be added to the block groups rbtree in a node before + * or after the block group currently being processed by the rebuild + * process. So signal the rebuild process to skip such new block groups + * if it finds them. + */ + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); + ret = add_new_free_space_info(trans, block_group, path); if (ret) return ret; From 6561a40ceced9082f50c374a22d5966cf9fc5f5c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Jun 2025 16:57:07 +0100 Subject: [PATCH 2/7] btrfs: fix missing error handling when searching for inode refs during log replay During log replay, at __add_inode_ref(), when we are searching for inode ref keys we totally ignore if btrfs_search_slot() returns an error. This may make a log replay succeed when there was an actual error and leave some metadata inconsistency in a subvolume tree. Fix this by checking if an error was returned from btrfs_search_slot() and if so, return it to the caller. Fixes: e02119d5a7b4 ("Btrfs: Add a write ahead tree log to optimize synchronous operations") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 858b609e292c..8b66173d9023 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1073,7 +1073,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { + if (ret < 0) { + return ret; + } else if (ret == 0) { struct btrfs_inode_ref *victim_ref; unsigned long ptr; unsigned long ptr_end; From 54a7081ed168b72a8a2d6ef4ba3a1259705a2926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 23 Jun 2025 12:11:58 +0100 Subject: [PATCH 3/7] btrfs: fix iteration of extrefs during log replay At __inode_add_ref() when processing extrefs, if we jump into the next label we have an undefined value of victim_name.len, since we haven't initialized it before we did the goto. This results in an invalid memory access in the next iteration of the loop since victim_name.len was not initialized to the length of the name of the current extref. Fix this by initializing victim_name.len with the current extref's name length. Fixes: e43eec81c516 ("btrfs: use struct qstr instead of name and namelen pairs") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8b66173d9023..d514bf531b3b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1148,13 +1148,13 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); + victim_name.len, &victim_name); if (ret) return ret; From 5f61b961599acbd2bed028d3089105a1f7d224b8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Jun 2025 15:58:31 +0100 Subject: [PATCH 4/7] btrfs: fix inode lookup error handling during log replay When replaying log trees we use read_one_inode() to get an inode, which is just a wrapper around btrfs_iget_logging(), which in turn is a wrapper for btrfs_iget(). But read_one_inode() always returns NULL for any error that btrfs_iget_logging() / btrfs_iget() may return and this is a problem because: 1) In many callers of read_one_inode() we convert the NULL into -EIO, which is not accurate since btrfs_iget() may return -ENOMEM and -ENOENT for example, besides -EIO and other errors. So during log replay we may end up reporting a false -EIO, which is confusing since we may not have had any IO error at all; 2) When replaying directory deletes, at replay_dir_deletes(), we assume the NULL returned from read_one_inode() means that the inode doesn't exist and then proceed as if no error had happened. This is wrong because unless btrfs_iget() returned ERR_PTR(-ENOENT), we had an actual error and the target inode may exist in the target subvolume root - this may later result in the log replay code failing at a later stage (if we are "lucky") or succeed but leaving some inconsistency in the filesystem. So fix this by not ignoring errors from btrfs_iget_logging() and as a consequence remove the read_one_inode() wrapper and just use btrfs_iget_logging() directly. Also since btrfs_iget_logging() is supposed to be called only against subvolume roots, just like read_one_inode() which had a comment about it, add an assertion to btrfs_iget_logging() to check that the target root corresponds to a subvolume root. Fixes: 5d4f98a28c7d ("Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 127 +++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d514bf531b3b..8cf5e5ae593c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r unsigned int nofs_flag; struct btrfs_inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct btrfs_inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -674,9 +662,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, return -EUCLEAN; } - inode = read_one_inode(root, key->objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the @@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1169,10 +1158,10 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, kfree(victim_name.name); return ret; } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + ret = PTR_ERR(victim_parent); + } else { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); @@ -1317,9 +1306,9 @@ static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } @@ -1391,15 +1380,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -1411,11 +1402,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); @@ -1684,9 +1677,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } @@ -1722,9 +1715,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *inode; struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; @@ -1763,14 +1756,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { iput(&inode->vfs_inode); - return -EIO; + return PTR_ERR(dir); } ret = btrfs_add_link(trans, dir, inode, name, 1, index); @@ -1847,9 +1840,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -2149,9 +2142,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2303,14 +2297,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2469,9 +2466,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } from = ALIGN(i_size_read(&inode->vfs_inode), From bf5bcf9a6fa070ec8a725b08db63fb1318f77366 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 19 Jun 2025 13:13:38 +0100 Subject: [PATCH 5/7] btrfs: record new subvolume in parent dir earlier to avoid dir logging races Instead of recording that a new subvolume was created in a directory after we add the entry do the directory, record it before adding the entry. This is to avoid races where after creating the entry and before recording the new subvolume in the directory (the call to btrfs_record_new_subvolume()), another task logs the directory, so we end up with a log tree where we logged a directory that has an entry pointing to a root that was not yet committed, resulting in an invalid entry if the log is persisted and replayed later due to a power failure or crash. Also state this requirement in the function comment for btrfs_record_new_subvolume(), similar to what we do for the btrfs_record_unlink_dir() and btrfs_record_snapshot_destroy(). Fixes: 45c4102f0d82 ("btrfs: avoid transaction commit on any fsync after subvolume creation") Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/tree-log.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1e8f7082239c..e6f7d24e3b47 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8cf5e5ae593c..cea8a7e9d6d3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7447,6 +7447,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. * Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) From c466e33e729a0ee017d10d919cba18f503853c60 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 20 Jun 2025 15:54:05 +0100 Subject: [PATCH 6/7] btrfs: propagate last_unlink_trans earlier when doing a rmdir In case the removed directory had a snapshot that was deleted, we are propagating its inode's last_unlink_trans to the parent directory after we removed the entry from the parent directory. This leaves a small race window where someone can log the parent directory after we removed the entry and before we updated last_unlink_trans, and as a result if we ever try to replay such a log tree, we will fail since we will attempt to remove a snapshot during log replay, which is currently not possible and results in the log replay (and mount) to fail. This is the type of failure described in commit 1ec9a1ae1e30 ("Btrfs: fix unreplayable log after snapshot delete + parent dir fsync"). So fix this by propagating the last_unlink_trans to the parent directory before we remove the entry from it. Fixes: 44f714dae50a ("Btrfs: improve performance on fsync against new inode after rename/unlink") Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 26d6ed170a19..5a5c43586adf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4710,7 +4710,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) @@ -4736,6 +4735,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out_notrans; } + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; @@ -4745,27 +4761,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!ret) { + if (!ret) btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } out: btrfs_end_transaction(trans); out_notrans: From 157501b0469969fc1ba53add5049575aadd79d80 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 20 Jun 2025 16:37:01 +0100 Subject: [PATCH 7/7] btrfs: use btrfs_record_snapshot_destroy() during rmdir We are setting the parent directory's last_unlink_trans directly which may result in a concurrent task starting to log the directory not see the update and therefore can log the directory after we removed a child directory which had a snapshot within instead of falling back to a transaction commit. Replaying such a log tree would result in a mount failure since we can't currently delete snapshots (and subvolumes) during log replay. This is the type of failure described in commit 1ec9a1ae1e30 ("Btrfs: fix unreplayable log after snapshot delete + parent dir fsync"). Fix this by using btrfs_record_snapshot_destroy() which updates the last_unlink_trans field while holding the inode's log_mutex lock. Fixes: 44f714dae50a ("Btrfs: improve performance on fsync against new inode after rename/unlink") Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a5c43586adf..fc66872b4c74 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4750,7 +4750,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) * deletes for directory foo. */ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);