From 53e83031494715482d11d3e1394058a10d96f486 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Mon, 24 Nov 2025 11:53:05 +0800 Subject: [PATCH 001/137] btrfs: update comment for visit_node_for_delete() Drop the obsolete @refs parameter from the comment so the argument list matches the current function signature after commit f8c4d59de23c9 ("btrfs: drop unused parameter refs from visit_node_for_delete()"). Reviewed-by: Johannes Thumshirn Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e4cae34620d1..432e1de4436d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5263,7 +5263,6 @@ struct walk_control { * @root: the root we are currently deleting * @wc: the walk control for this deletion * @eb: the parent eb that we're currently visiting - * @refs: the number of refs for wc->level - 1 * @flags: the flags for wc->level - 1 * @slot: the slot in the eb that we're currently checking * From 8bfee251b7f5e54437e893c6daa964caded8a860 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Sat, 22 Nov 2025 14:00:43 +0800 Subject: [PATCH 002/137] btrfs: use true/false for boolean parameters in btrfs_inc_ref()/btrfs_dec_ref() Replace integer literals 0/1 with true/false when calling btrfs_inc_ref() and btrfs_dec_ref() to make the code self-documenting and avoid mixing bool/integer types. 
Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 20 ++++++++++---------- fs/btrfs/extent-tree.c | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a48b4befbee7..4b06c3c74ad4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -292,11 +292,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, } if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, true); if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } else { - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, false); if (unlikely(ret)) btrfs_abort_transaction(trans, ret); } @@ -420,15 +420,15 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == btrfs_root_id(root) || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, true); if (ret) return ret; if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + ret = btrfs_dec_ref(trans, root, buf, false); if (ret) return ret; - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, true); if (ret) return ret; } @@ -439,21 +439,21 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } else { if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, true); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, false); if (ret) return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, true); else - ret = 
btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, false); if (ret) return ret; - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, true); if (ret) return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 432e1de4436d..cc9f8a32f67b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5457,12 +5457,12 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, true); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, false); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; @@ -5864,13 +5864,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, true); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } } else { - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, false); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); return ret; From a5eb9024368dac8cc6b317c1577c6348e5334243 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Sat, 22 Nov 2025 14:00:44 +0800 Subject: [PATCH 003/137] btrfs: simplify boolean argument for btrfs_inc_ref()/btrfs_dec_ref() Replace open-coded if/else blocks with the boolean directly and introduce local const bool variables, making the code shorter and easier to read. 
Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 38 ++++++++++++-------------------------- fs/btrfs/extent-tree.c | 18 ++++++------------ 2 files changed, 18 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b06c3c74ad4..e245b8c4c340 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -249,6 +249,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID); u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && @@ -262,7 +263,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (is_reloc_root) reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, @@ -276,7 +277,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (is_reloc_root) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else btrfs_set_header_owner(cow, new_root_objectid); @@ -291,16 +292,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return ret; } - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_inc_ref(trans, root, cow, true); - if (unlikely(ret)) - btrfs_abort_transaction(trans, ret); - } else { - ret = btrfs_inc_ref(trans, root, cow, false); - if (unlikely(ret)) - btrfs_abort_transaction(trans, ret); - } - if (ret) { + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(cow); free_extent_buffer(cow); return ret; @@ -362,6 +356,7 @@ static noinline 
int update_ref_for_cow(struct btrfs_trans_handle *trans, u64 owner; u64 flags; int ret; + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); /* * Backrefs update rules: @@ -397,8 +392,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { refs = 1; - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else flags = 0; @@ -417,14 +411,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } if (refs > 1) { - if ((owner == btrfs_root_id(root) || - btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || is_reloc_root) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, true); if (ret) return ret; - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + if (is_reloc_root) { ret = btrfs_dec_ref(trans, root, buf, false); if (ret) return ret; @@ -437,20 +430,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } else { - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, true); - else - ret = btrfs_inc_ref(trans, root, cow, false); + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); if (ret) return ret; } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, true); - else - ret = btrfs_inc_ref(trans, root, cow, false); + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); if (ret) return ret; ret = btrfs_dec_ref(trans, root, buf, true); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cc9f8a32f67b..04a266bb189b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5863,18 +5863,12 @@ static noinline int 
walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - ret = btrfs_dec_ref(trans, root, eb, true); - if (ret) { - btrfs_abort_transaction(trans, ret); - return ret; - } - } else { - ret = btrfs_dec_ref(trans, root, eb, false); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - return ret; - } + const bool full_backref = (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF); + + ret = btrfs_dec_ref(trans, root, eb, full_backref); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; } if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); From 9c46bcda5f347febdbb4d117fb21a37ffcec5fa4 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Dec 2025 15:33:49 -0800 Subject: [PATCH 004/137] btrfs: check squota parent usage on membership change We could have detected the quick inherit bug more directly if we had an extra warning about squota hierarchy consistency while modifying the hierarchy. In squotas, the parent usage always simply adds up to the sum of its children, so we can just check for that when changing membership and detect more accounting bugs. 
Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 206587820fec..3a74759b59ee 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -346,6 +346,42 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid } #endif +static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *parent) +{ + u64 excl_sum = 0; + u64 rfer_sum = 0; + u64 excl_cmpr_sum = 0; + u64 rfer_cmpr_sum = 0; + struct btrfs_qgroup_list *glist; + int nr_members = 0; + bool mismatch; + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return false; + if (btrfs_qgroup_level(parent->qgroupid) == 0) + return false; + + /* Eligible parent qgroup. Squota; level > 0; empty members list. */ + list_for_each_entry(glist, &parent->members, next_member) { + excl_sum += glist->member->excl; + rfer_sum += glist->member->rfer; + excl_cmpr_sum += glist->member->excl_cmpr; + rfer_cmpr_sum += glist->member->rfer_cmpr; + nr_members++; + } + mismatch = (parent->excl != excl_sum || parent->rfer != rfer_sum || + parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum); + + WARN(mismatch, + "parent squota qgroup %hu/%llu has mismatched usage from its %d members. " + "%llu %llu %llu %llu vs %llu %llu %llu %llu\n", + btrfs_qgroup_level(parent->qgroupid), + btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl, + parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum, + rfer_sum, excl_cmpr_sum, rfer_cmpr_sum); + return mismatch; +} + __printf(2, 3) static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) 
{ @@ -1562,6 +1598,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst goto out; } ret = quick_update_accounting(fs_info, src, dst, 1); + squota_check_parent_usage(fs_info, parent); spin_unlock(&fs_info->qgroup_lock); out: kfree(prealloc); @@ -1618,6 +1655,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); ret = quick_update_accounting(fs_info, src, dst, -1); + ASSERT(parent); + squota_check_parent_usage(fs_info, parent); spin_unlock(&fs_info->qgroup_lock); } out: From adb0af40fe89fd42f1ef277bf60d9cfa7c2ae472 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Dec 2025 15:35:02 -0800 Subject: [PATCH 005/137] btrfs: relax squota parent qgroup deletion rule Currently, with squotas, we do not allow removing a parent qgroup with no members if it still has usage accounted to it. This makes it really difficult to recover from accounting bugs, as we have no good way of getting back to 0 usage. Instead, allow deletion (it's safe at 0 members..) while still warning about the inconsistency by adding a squota parent check. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 50 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3a74759b59ee..ae4a1b76646c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1718,6 +1718,36 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) return ret; } +static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup) + +{ + ASSERT(btrfs_qgroup_level(qgroup->qgroupid)); + return list_empty(&qgroup->members); +} + +/* + * Return true if we can delete the squota qgroup and false otherwise. 
+ * + * Rules for whether we can delete: + * + * A subvolume qgroup can be removed iff the subvolume is fully deleted, which + * is iff there is 0 usage in the qgroup. + * + * A higher level qgroup can be removed iff it has no members. + * Note: We audit its usage to warn on inconsistencies without blocking deletion. + */ +static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) +{ + ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); + + if (btrfs_qgroup_level(qgroup->qgroupid) > 0) { + squota_check_parent_usage(fs_info, qgroup); + return can_delete_parent_qgroup(qgroup); + } + + return !(qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr); +} + /* * Return 0 if we can not delete the qgroup (not empty or has children etc). * Return >0 if we can delete the qgroup. @@ -1728,23 +1758,13 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); - /* - * Squota would never be inconsistent, but there can still be case - * where a dropped subvolume still has qgroup numbers, and squota - * relies on such qgroup for future accounting. - * - * So for squota, do not allow dropping any non-zero qgroup. - */ - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && - (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) - return 0; + /* Since squotas cannot be inconsistent, they have special rules for deletion. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return can_delete_squota_qgroup(fs_info, qgroup); /* For higher level qgroup, we can only delete it if it has no child. 
*/ - if (btrfs_qgroup_level(qgroup->qgroupid)) { - if (!list_empty(&qgroup->members)) - return 0; - return 1; - } + if (btrfs_qgroup_level(qgroup->qgroupid)) + return can_delete_parent_qgroup(qgroup); /* * For level-0 qgroups, we can only delete it if it has no subvolume From b39b26e017c7889181cb84032e22bef72e81cf29 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Dec 2025 13:42:23 +0100 Subject: [PATCH 006/137] btrfs: zoned: don't zone append to conventional zone In case of a zoned RAID, it can happen that a data write is targeting a sequential write required zone and a conventional zone. In this case the bio will be marked as REQ_OP_ZONE_APPEND but for the conventional zone, this needs to be REQ_OP_WRITE. The setting of REQ_OP_ZONE_APPEND is deferred to the last possible time in btrfs_submit_dev_bio(), but the decision if we can use zone append is cached in btrfs_bio. CC: Naohiro Aota Fixes: e9b9b911e03c ("btrfs: add raid stripe tree to features enabled with debug config") Reviewed-by: Christoph Hellwig Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/bio.c | 19 +++++++++---------- fs/btrfs/bio.h | 3 +++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fa1d321a2fb8..e4d382d3a7ae 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -480,6 +480,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(bio) == BTRFS_MAP_WRITE && @@ -494,12 +496,13 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) * For zone append writing, bi_sector must point the beginning of the * zone */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if 
(btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) { u64 zone_start = round_down(physical, dev->fs_info->zone_size); ASSERT(btrfs_dev_is_sequential(dev, physical)); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; } btrfs_debug(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -747,7 +750,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; u64 map_length = length; - bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; blk_status_t status; @@ -775,8 +777,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) bbio->orig_logical = logical; + bbio->can_use_append = btrfs_use_zone_append(bbio); + map_length = min(map_length, length); - if (use_append) + if (bbio->can_use_append) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { @@ -805,11 +809,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (use_append) { - bio->bi_opf &= ~REQ_OP_WRITE; - bio->bi_opf |= REQ_OP_ZONE_APPEND; - } - if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to @@ -836,7 +835,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) status = errno_to_blk_status(ret); if (status) goto fail; - } else if (use_append || + } else if (bbio->can_use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 1be74209f0b8..246c7519dff3 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -92,6 +92,9 @@ struct btrfs_bio { /* Whether the csum generation 
for data write is async. */ bool async_csum; + /* Whether the bio is written using zone append. */ + bool can_use_append; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. From fe11ac191ce0ad910f6fda0c628bcff19fcff47d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Dec 2025 23:04:54 -0800 Subject: [PATCH 007/137] btrfs: switch to library APIs for checksums Make btrfs use the library APIs instead of crypto_shash, for all checksum computations. This has many benefits: - Allows future checksum types, e.g. XXH3 or CRC64, to be more easily supported. Only a library API will be needed, not crypto_shash too. - Eliminates the overhead of the generic crypto layer, including an indirect call for every function call and other API overhead. A microbenchmark of btrfs_check_read_bio() with crc32c checksums shows a speedup from 658 cycles to 608 cycles per 4096-byte block. - Decreases the stack usage of btrfs by reducing the size of checksum contexts from 384 bytes to 240 bytes, and by eliminating the need for some functions to declare a checksum context at all. - Increases reliability. The library functions always succeed and return void. In contrast, crypto_shash can fail and return errors. Also, the library functions are guaranteed to be available when btrfs is loaded; there's no longer any need to use module softdeps to try to work around the crypto modules sometimes not being loaded. - Fixes a bug where blake2b checksums didn't work on kernels booted with fips=1. Since btrfs checksums are for integrity only, it's fine for them to use non-FIPS-approved algorithms. Note that with having to handle 4 algorithms instead of just 1-2, this commit does result in a slightly positive diffstat. That being said, this wouldn't have been the case if btrfs had actually checked for errors from crypto_shash, which technically it should have been doing. 
Reviewed-by: Ard Biesheuvel Reviewed-by: Neal Gompa Signed-off-by: Eric Biggers Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 8 ++-- fs/btrfs/compression.c | 1 - fs/btrfs/disk-io.c | 68 +++++++-------------------- fs/btrfs/file-item.c | 4 -- fs/btrfs/fs.c | 102 +++++++++++++++++++++++++++++++++++------ fs/btrfs/fs.h | 23 ++++++++-- fs/btrfs/inode.c | 10 ++-- fs/btrfs/scrub.c | 16 +++---- fs/btrfs/super.c | 4 -- fs/btrfs/sysfs.c | 6 +-- 10 files changed, 138 insertions(+), 104 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 6d6fc85835d4..d88eb836a193 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,11 +4,8 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO select CRC32 - select CRYPTO - select CRYPTO_CRC32C - select CRYPTO_XXHASH - select CRYPTO_SHA256 - select CRYPTO_BLAKE2B + select CRYPTO_LIB_BLAKE2B + select CRYPTO_LIB_SHA256 select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS @@ -18,6 +15,7 @@ config BTRFS_FS select FS_IOMAP select RAID6_PQ select XOR_BLOCKS + select XXHASH depends on PAGE_SIZE_LESS_THAN_256KB help diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6b3357287b42..4323d4172c7b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "misc.h" #include "ctree.h" #include "fs.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2833b44f4b4f..12d91407bb60 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -62,12 +61,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) -{ - if (fs_info->csum_shash) - crypto_free_shash(fs_info->csum_shash); -} - /* * Compute the csum of a btree 
block and store the result to provided buffer. */ @@ -76,12 +69,11 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) struct btrfs_fs_info *fs_info = buf->fs_info; int num_pages; u32 first_page_part; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; char *kaddr; int i; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); + btrfs_csum_init(&csum, fs_info->csum_type); if (buf->addr) { /* Pages are contiguous, handle them as a big one. */ @@ -94,21 +86,21 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) num_pages = num_extent_pages(buf); } - crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - first_page_part - BTRFS_CSUM_SIZE); + btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE, + first_page_part - BTRFS_CSUM_SIZE); /* * Multiple single-page folios case would reach here. * * nodesize <= PAGE_SIZE and large folio all handled by above - * crypto_shash_update() already. + * btrfs_csum_update() already. */ for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = folio_address(buf->folios[i]); - crypto_shash_update(shash, kaddr, PAGE_SIZE); + btrfs_csum_update(&csum, kaddr, PAGE_SIZE); } memset(result, 0, BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); + btrfs_csum_final(&csum, result); } /* @@ -160,18 +152,15 @@ static bool btrfs_supported_super_csum(u16 csum_type) int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb) { - char result[BTRFS_CSUM_SIZE]; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - - shash->tfm = fs_info->csum_shash; + u8 result[BTRFS_CSUM_SIZE]; /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. 
*/ - crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); + btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; @@ -1229,7 +1218,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) ASSERT(percpu_counter_sum_positive(em_counter) == 0); percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); - btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); @@ -1983,21 +1971,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) return 0; } -static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) { - struct crypto_shash *csum_shash; - const char *csum_driver = btrfs_super_csum_driver(csum_type); - - csum_shash = crypto_alloc_shash(csum_driver, 0, 0); - - if (IS_ERR(csum_shash)) { - btrfs_err(fs_info, "error allocating %s hash for checksum", - csum_driver); - return PTR_ERR(csum_shash); - } - - fs_info->csum_shash = csum_shash; - /* Check if the checksum implementation is a fast accelerated one. 
*/ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: @@ -2011,10 +1986,8 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) break; } - btrfs_info(fs_info, "using %s (%s) checksum algorithm", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(csum_shash)); - return 0; + btrfs_info(fs_info, "using %s checksum algorithm", + btrfs_super_csum_name(csum_type)); } static int btrfs_replay_log(struct btrfs_fs_info *fs_info, @@ -3302,12 +3275,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } fs_info->csum_size = btrfs_super_csum_size(disk_super); + fs_info->csum_type = csum_type; - ret = btrfs_init_csum_hash(fs_info, csum_type); - if (ret) { - btrfs_release_disk_super(disk_super); - goto fail_alloc; - } + btrfs_init_csum_hash(fs_info, csum_type); /* * We want to check superblock checksum, the type is stored inside. @@ -3709,7 +3679,6 @@ static int write_dev_supers(struct btrfs_device *device, { struct btrfs_fs_info *fs_info = device->fs_info; struct address_space *mapping = device->bdev->bd_mapping; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int ret; u64 bytenr, bytenr_orig; @@ -3719,8 +3688,6 @@ static int write_dev_supers(struct btrfs_device *device, if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - shash->tfm = fs_info->csum_shash; - for (i = 0; i < max_mirrors; i++) { struct folio *folio; struct bio *bio; @@ -3744,9 +3711,8 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr_orig); - crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, - sb->csum); + btrfs_csum(fs_info->csum_type, (const u8 *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 14e5257f0f04..568f0e0ebdf6 100644 --- 
a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,7 +8,6 @@ #include #include #include -#include #include "messages.h" #include "ctree.h" #include "disk-io.h" @@ -769,7 +768,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums = bbio->sums; struct bvec_iter iter = *src; @@ -781,8 +779,6 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) u32 offset = 0; int index = 0; - shash->tfm = fs_info->csum_shash; - btrfs_bio_for_each_block(paddr, bio, &iter, step) { paddrs[(offset / step) % nr_steps] = paddr; offset += step; diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index feb0a2faa837..14d83565cdee 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include "messages.h" #include "fs.h" #include "accessors.h" @@ -8,13 +9,11 @@ static const struct btrfs_csums { u16 size; const char name[10]; - const char driver[12]; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b" }, }; /* This exists for btrfs-progs usages. */ @@ -37,23 +36,96 @@ const char *btrfs_super_csum_name(u16 csum_type) return btrfs_csums[csum_type].name; } -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name. - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? 
btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - size_t __attribute_const__ btrfs_get_num_csums(void) { return ARRAY_SIZE(btrfs_csums); } +void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + put_unaligned_le32(~crc32c(~0, data, len), out); + break; + case BTRFS_CSUM_TYPE_XXHASH: + put_unaligned_le64(xxh64(data, len, 0), out); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256(data, len, out); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b(NULL, 0, data, len, out, 32); + break; + default: + /* Checksum type is validated at mount time. */ + BUG(); + } +} + +void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type) +{ + ctx->csum_type = csum_type; + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + ctx->crc32 = ~0; + break; + case BTRFS_CSUM_TYPE_XXHASH: + xxh64_reset(&ctx->xxh64, 0); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_init(&ctx->sha256); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_init(&ctx->blake2b, 32); + break; + default: + /* Checksum type is validated at mount time. */ + BUG(); + } +} + +void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len) +{ + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + ctx->crc32 = crc32c(ctx->crc32, data, len); + break; + case BTRFS_CSUM_TYPE_XXHASH: + xxh64_update(&ctx->xxh64, data, len); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_update(&ctx->sha256, data, len); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_update(&ctx->blake2b, data, len); + break; + default: + /* Checksum type is validated at mount time. 
*/ + BUG(); + } +} + +void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out) +{ + switch (ctx->csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + put_unaligned_le32(~ctx->crc32, out); + break; + case BTRFS_CSUM_TYPE_XXHASH: + put_unaligned_le64(xxh64_digest(&ctx->xxh64), out); + break; + case BTRFS_CSUM_TYPE_SHA256: + sha256_final(&ctx->sha256, out); + break; + case BTRFS_CSUM_TYPE_BLAKE2: + blake2b_final(&ctx->blake2b, out); + break; + default: + /* Checksum type is validated at mount time. */ + BUG(); + } +} + /* * We support the following block sizes for all systems: * diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8ffbc40ebe45..458a24206935 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,8 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#include +#include #include #include #include @@ -24,6 +26,7 @@ #include #include #include +#include #include #include #include "extent-io-tree.h" @@ -35,7 +38,6 @@ struct inode; struct super_block; struct kobject; struct reloc_control; -struct crypto_shash; struct ulist; struct btrfs_device; struct btrfs_block_group; @@ -850,9 +852,10 @@ struct btrfs_fs_info { u32 sectorsize_bits; u32 block_min_order; u32 block_max_order; + u32 stripesize; u32 csum_size; u32 csums_per_leaf; - u32 stripesize; + u32 csum_type; /* * Maximum size of an extent. 
BTRFS_MAX_EXTENT_SIZE on regular @@ -864,8 +867,6 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; - struct crypto_shash *csum_shash; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; @@ -1057,8 +1058,20 @@ int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); +struct btrfs_csum_ctx { + u16 csum_type; + union { + u32 crc32; + struct xxh64_state xxh64; + struct sha256_ctx sha256; + struct blake2b_ctx blake2b; + }; +}; +void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out); +void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type); +void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len); +void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out); static inline bool btrfs_is_empty_uuid(const u8 *uuid) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2b5b440637e..5dceb03bee0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3,7 +3,6 @@ * Copyright (C) 2007 Oracle. All rights reserved. 
*/ -#include #include #include #include @@ -3417,20 +3416,19 @@ void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, const u32 blocksize = fs_info->sectorsize; const u32 step = min(blocksize, PAGE_SIZE); const u32 nr_steps = blocksize / step; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); + btrfs_csum_init(&csum, fs_info->csum_type); for (int i = 0; i < nr_steps; i++) { const phys_addr_t paddr = paddrs[i]; void *kaddr; ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE); kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); - crypto_shash_update(shash, kaddr, step); + btrfs_csum_update(&csum, kaddr, step); kunmap_local(kaddr); } - crypto_shash_final(shash, dest); + btrfs_csum_final(&csum, dest); } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a40ee41f42c6..1a60e631d801 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "ctree.h" #include "discard.h" #include "volumes.h" @@ -718,7 +717,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); struct btrfs_header *header = first_kaddr; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct btrfs_csum_ctx csum; u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; @@ -760,17 +759,16 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } /* Now check tree block csum. 
*/ - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, - fs_info->sectorsize - BTRFS_CSUM_SIZE); + btrfs_csum_init(&csum, fs_info->csum_type); + btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), - fs_info->sectorsize); + btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i), + fs_info->sectorsize); } - crypto_shash_final(shash, calculated_csum); + btrfs_csum_final(&csum, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index af56fdbba65d..0a931555e6dc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2700,7 +2700,3 @@ module_exit(exit_btrfs_fs) MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: xxhash64"); -MODULE_SOFTDEP("pre: sha256"); -MODULE_SOFTDEP("pre: blake2b-256"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4b3c2acac51a..f0974f4c0ae4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "messages.h" #include "ctree.h" #include "discard.h" @@ -1253,10 +1252,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); + const char *csum_name = btrfs_super_csum_name(csum_type); - return sysfs_emit(buf, "%s (%s)\n", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(fs_info->csum_shash)); + return sysfs_emit(buf, "%s (%s-lib)\n", csum_name, csum_name); } BTRFS_ATTR(, checksum, btrfs_checksum_show); From 
ddea91780fecd349eb4b2c4036bfbf1ab5f75321 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Dec 2025 07:43:58 +1030 Subject: [PATCH 008/137] btrfs: enable direct IO for bs > ps cases Previously direct IO was disabled if the fs block size was larger than the page size, the reasons are: - Iomap direct IO can split the range ignoring the fs block alignment Which could trigger the bio size check from btrfs_submit_bio(). - The buffer is only ensured to be contiguous in user space memory The underlying physical memory is not ensured to be contiguous, and that can cause problems for the checksum generation/verification and RAID56 handling. However the above problems are solved by the following upstream commits: - 001397f5ef49 ("iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag") Which added an extra flag that can be utilized by the fs to ensure the bio submitted by iomap is always aligned to fs block size. - ec20799064c8 ("btrfs: enable encoded read/write/send for bs > ps cases") - 8870dbeedcf9 ("btrfs: raid56: enable bs > ps support") Which makes btrfs handle bios that are not backed by large folios but still are aligned to fs block size. As the commits have been merged we can enable direct IO support for bs > ps cases. 
Reviewed-by: Neal Gompa Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 07e19e88ba4b..bc7cc2d81f8f 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -763,7 +763,7 @@ static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, @@ -772,7 +772,7 @@ static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *it struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -785,19 +785,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; - - /* - * For bs > ps support, we heavily rely on large folios to make sure no - * block will cross large folio boundaries. - * - * But memory provided by direct IO is only virtually contiguous, not - * physically contiguous, and will break the btrfs' large folio requirement. - * - * So for bs > ps support, all direct IOs should fallback to buffered ones. 
- */ - if (fs_info->sectorsize > PAGE_SIZE) - return -EINVAL; - return 0; } From fab0c0f03cfd0dfe793889e3374ebc68ecf18889 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 26 Nov 2025 08:20:21 +1030 Subject: [PATCH 009/137] btrfs: introduce BTRFS_PATH_AUTO_RELEASE() helper There are already several bugs with on-stack btrfs_path involved, even though it is already a little safer than btrfs_path pointers (only leaks the extent buffers, not the btrfs_path structure itself) - Patch "btrfs: make sure extent and csum paths are always released in scrub_raid56_parity_stripe()" - Patch "btrfs: fix a potential path leak in print_data_reloc_error()" Thus there is a real need to apply auto release for those on-stack paths. Introduces a new macro, BTRFS_PATH_AUTO_RELEASE() which defines one on-stack btrfs_path structure, initialize it all to 0, then call btrfs_release_path() on it when exiting the scope. This applies to current 3 on-stack path usages: - defrag_get_extent() in defrag.c - print_data_reloc_error() in inode.c There is a special case where we want to release the path early before the time consuming iterate_extent_inodes() call, thus that manual early release is kept as is, with an extra comment added. - scrub_raid56_parity_stripe() in scrub.c Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/defrag.c | 5 +---- fs/btrfs/inode.c | 8 +++++--- fs/btrfs/scrub.c | 23 ++++++++--------------- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 692370fc07b2..6de7ad191e04 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -85,6 +85,14 @@ struct btrfs_path { #define BTRFS_PATH_AUTO_FREE(path_name) \ struct btrfs_path *path_name __free(btrfs_free_path) = NULL +/* + * This defines an on-stack path that will be auto released when exiting the scope. + * + * It is compatible with any existing manual btrfs_release_path() calls. 
+ */ +#define BTRFS_PATH_AUTO_RELEASE(path_name) \ + struct btrfs_path path_name __free(btrfs_release_path) = { 0 } + /* * The state of btrfs root */ @@ -601,6 +609,7 @@ void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) +DEFINE_FREE(btrfs_release_path, struct btrfs_path, btrfs_release_path(&_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b81e224d4a27..bcc6656ad034 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -609,7 +609,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, { struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; - struct btrfs_path path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(path); struct extent_map *em; struct btrfs_key key; u64 ino = btrfs_ino(inode); @@ -720,16 +720,13 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, if (ret > 0) goto not_found; } - btrfs_release_path(&path); return em; not_found: - btrfs_release_path(&path); btrfs_free_extent_map(em); return NULL; err: - btrfs_release_path(&path); btrfs_free_extent_map(em); return ERR_PTR(ret); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5dceb03bee0a..912343fc9a73 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -217,7 +217,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off int mirror_num) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_path path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(path); struct btrfs_key found_key = { 0 }; struct extent_buffer *eb; struct btrfs_extent_item *ei; @@ -255,7 +255,6 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical 
%llu: %d", logical, ret); - btrfs_release_path(&path); return; } eb = path.nodes[0]; @@ -285,11 +284,14 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off (ref_level ? "node" : "leaf"), ref_level, ref_root); } - btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; struct data_reloc_warn reloc_warn = { 0 }; + /* + * Do not hold the path as later iterate_extent_inodes() call + * can be time consuming. + */ btrfs_release_path(&path); ctx.bytenr = found_key.objectid; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1a60e631d801..2372084cf6c5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2171,8 +2171,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, u64 full_stripe_start) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_path extent_path = { 0 }; - struct btrfs_path csum_path = { 0 }; + BTRFS_PATH_AUTO_RELEASE(extent_path); + BTRFS_PATH_AUTO_RELEASE(csum_path); struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); @@ -2224,7 +2224,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) - goto out; + return ret; /* * No extent in this data stripe, need to manually mark them * initialized to make later read submission happy. 
@@ -2246,10 +2246,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, break; } } - if (all_empty) { - ret = 0; - goto out; - } + if (all_empty) + return 0; for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; @@ -2290,20 +2288,15 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); - ret = -EIO; - goto out; + return ret; } bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ - ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, - &extent_bitmap); -out: - btrfs_release_path(&extent_path); - btrfs_release_path(&csum_path); - return ret; + return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, + &extent_bitmap); } /* From 3970da5c3ba0993eba8b83c63726f61ef97e3ae7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 11 Dec 2025 16:55:23 +1030 Subject: [PATCH 010/137] btrfs: search for larger extent maps inside btrfs_do_readpage() [CORNER CASE] If we have the following file extents layout, btrfs_get_extent() can return a smaller hole during read, and cause unnecessary extra tree searches: item 6 key (257 EXTENT_DATA 0) itemoff 15810 itemsize 53 generation 9 type 1 (regular) extent data disk byte 13631488 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) item 7 key (257 EXTENT_DATA 32768) itemoff 15757 itemsize 53 generation 9 type 1 (regular) extent data disk byte 13635584 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression 0 (none) In above case, range [0, 4K) and [32K, 36K) are regular extents, and there is a hole in range [4K, 32K), and the fs has "no-holes" feature, meaning the hole will not have a file extent item. 
[INEFFICIENCY] Assume the system has 4K page size, and we're doing readahead for range [4K, 32K), no large folio yet. btrfs_readahead() for range [4K, 32K) |- btrfs_do_readpage() for folio 4K | |- get_extent_map() for range [4K, 8K) | |- btrfs_get_extent() for range [4K, 8K) | We hit item 6, then for the next item 7. | At this stage we know range [4K, 32K) is a hole. | But our search range is only [4K, 8K), not reaching 32K, thus | we go into not_found: tag, returning a hole em for [4K, 8K). | |- btrfs_do_readpage() for folio 8K | |- get_extent_map() for range [8K, 12K) | |- btrfs_get_extent() for range [8K, 12K) | We hit the same item 6, and then item 7. | But still we goto not_found tag, inserting a new hole em, | which will be merged with previous one. | | [ Repeat the same btrfs_get_extent() calls until the end ] So we're calling btrfs_get_extent() again and again, just for a different part of the same hole range [4K, 32K). [ENHANCEMENT] Make btrfs_do_readpage() to search for a larger extent map if readahead is involved. For btrfs_readahead() we have bio_ctrl::ractl set, and lock extents for the whole readahead range. If we find bio_ctrl::ractl is set, we can use that end range as extent map search end, this allows btrfs_get_extent() to return a much larger hole, thus reduce the need to call btrfs_get_extent() again and again. btrfs_readahead() for range [4K, 32K) |- btrfs_do_readpage() for folio 4K | |- get_extent_map() for range [4K, 32K) | |- btrfs_get_extent() for range [4K, 32K) | We hit item 6, then for the next item 7. | At this stage we know range [4K, 32K) is a hole. | So the hole em for range [4K, 32K) is returned. | |- btrfs_do_readpage() for folio 8K | |- get_extent_map() for range [8K, 32K) | The cached hole em range [4K, 32K) covers the range, | and reuse that em. | | [ Repeat the same btrfs_get_extent() calls until the end ] Now we only call btrfs_get_extent() once for the whole range [4K, 32K), other than the old 8 times. 
Such change will reduce the overhead of reading large holes a little. For current experimental build (with larger folios) on aarch64, there will be a tiny but consistent ~1% improvement reading a large hole file: Reading a 1GiB sparse file (all hole) using xfs_io, with 64K block size, the result is the time needed to read the whole file, reported from xfs_io. 32 runs, experimental build (with large folios). 64K page size, 4K fs block size. - Avg before: 0.20823 s - Avg after: 0.20635 s - Diff: -0.9% Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f6cca3c97166..17a6b01562cd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -998,11 +998,17 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; u64 extent_offset; + u64 locked_end; u64 last_byte = i_size_read(inode); struct extent_map *em; int ret = 0; const size_t blocksize = fs_info->sectorsize; + if (bio_ctrl->ractl) + locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1; + else + locked_end = end; + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); @@ -1036,7 +1042,14 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, blocksize); continue; } - em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); + /* + * Search extent map for the whole locked range. + * This will allow btrfs_get_extent() to return a larger hole + * when possible. + * This can reduce duplicated btrfs_get_extent() calls for large + * holes. 
+ */ + em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); From 44820d80026e0b509007d41c83d42f1213ee8589 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Dec 2025 19:02:33 +1030 Subject: [PATCH 011/137] btrfs: concentrate the error handling of submit_one_sector() Currently submit_one_sector() has only one failure path from btrfs_get_extent(). However the error handling is split into two parts, one inside submit_one_sector(), which clears the dirty flag and finishes the writeback for the fs block. The other part is to submit any remaining bio inside bio_ctrl and mark the ordered extent finished for the fs block. There is no special reason that we must split the error handling, let's just concentrate all the error handling into submit_one_sector(). Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 17a6b01562cd..9d4a95e4e2e7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1611,7 +1611,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, /* * Return 0 if we have submitted or queued the sector for submission. - * Return <0 for critical errors, and the sector will have its dirty flag cleared. + * Return <0 for critical errors, and the involved sector will be cleaned up. * * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ @@ -1635,6 +1635,13 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. 
+ */ + submit_one_bio(bio_ctrl); + /* * When submission failed, we should still clear the folio dirty. * Or the folio will be written back again but without any @@ -1643,6 +1650,13 @@ static int submit_one_sector(struct btrfs_inode *inode, btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); + + /* + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, filepos, + fs_info->sectorsize, false); return PTR_ERR(em); } @@ -1769,19 +1783,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (unlikely(ret < 0)) { - /* - * bio_ctrl may contain a bio crossing several folios. - * Submit it immediately so that the bio has a chance - * to finish normally, other than marked as error. - */ - submit_one_bio(bio_ctrl); - /* - * Failed to grab the extent map which should be very rare. - * Since there is no bio submitted to finish the ordered - * extent, we have to manually finish this sector. - */ - btrfs_mark_ordered_io_finished(inode, folio, cur, - fs_info->sectorsize, false); if (!found_error) found_error = ret; continue; From e6698b34fab33867ef3faeeea6feb165f31aae24 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Dec 2025 19:02:34 +1030 Subject: [PATCH 012/137] btrfs: replace for_each_set_bit() with for_each_set_bitmap() Inside extent_io.c, there are several simple call sites doing things like: for_each_set_bit(bit, bitmap, bitmap_size) { /* handle one fs block */ } The workload includes: - set_bit() Inside extent_writepage_io(). This can be replaced with a bitmap_set(). - btrfs_folio_set_lock() - btrfs_mark_ordered_io_finished() Inside writepage_delalloc(). 
Instead of calling it multiple times, we can pass a range into the function with one call. Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9d4a95e4e2e7..f804131b1c78 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1439,8 +1439,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; + unsigned int start_bit; + unsigned int end_bit; int ret = 0; - int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, folio)) { @@ -1450,10 +1451,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { - u64 start = page_start + (bit << fs_info->sectorsize_bits); + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, + blocks_per_folio) { + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; - btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + btrfs_folio_set_lock(fs_info, folio, start, len); } /* Lock all (subpage) delalloc ranges inside the folio first. 
*/ @@ -1570,10 +1573,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, fs_info->sectorsize_bits, blocks_per_folio); - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) - btrfs_mark_ordered_io_finished(inode, folio, - page_start + (bit << fs_info->sectorsize_bits), - fs_info->sectorsize, false); + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, + bitmap_size) { + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; + + btrfs_mark_ordered_io_finished(inode, folio, start, len, false); + } return ret; } out: @@ -1741,8 +1747,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return ret; } - for (cur = start; cur < end; cur += fs_info->sectorsize) - set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits, + len >> fs_info->sectorsize_bits); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, blocks_per_folio); From d7a5d511c098ed3e67912272c938627f04a1a9ad Mon Sep 17 00:00:00 2001 From: Massimiliano Pellizzer Date: Thu, 4 Dec 2025 22:09:59 +0100 Subject: [PATCH 013/137] btrfs: remove dead assignment in prepare_one_folio() In prepare_one_folio(), ret is initialized to 0 at declaration, and in an error path we assign ret = 0 before jumping to the again label to retry the operation. However, ret is immediately overwritten by ret = set_folio_extent_mapped(folio) after the again label. Both assignments are never observed by any code path, therefore they can be safely removed. 
Signed-off-by: Massimiliano Pellizzer Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1abc7ed2990e..87425a243040 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -859,7 +859,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); struct folio *folio; - int ret = 0; + int ret; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); @@ -876,10 +876,8 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ if (ret) { /* The folio is already unlocked. */ folio_put(folio); - if (!nowait && ret == -EAGAIN) { - ret = 0; + if (!nowait && ret == -EAGAIN) goto again; - } return ret; } *folio_ret = folio; From 4b117be65ff41efae3694df449b9badb4e9d142e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 9 Dec 2025 20:06:49 +0100 Subject: [PATCH 014/137] btrfs: merge setting ret and return ret In many places we have pattern: ret = ...; return ret; This can be simplified to a direct return, removing 'ret' if not otherwise needed. The places in self tests are not converted so we can add more test cases without changing surrounding code (extent-map-tests.c:test_case_4()). 
Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +-- fs/btrfs/ctree.c | 3 +-- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 13 +++++-------- fs/btrfs/extent-io-tree.c | 7 ++----- fs/btrfs/extent-tree.c | 10 +++------- fs/btrfs/file.c | 3 +-- fs/btrfs/free-space-tree.c | 4 +--- fs/btrfs/inode-item.c | 7 +++---- fs/btrfs/inode.c | 13 ++++--------- fs/btrfs/qgroup.c | 14 ++++---------- fs/btrfs/relocation.c | 5 ++--- fs/btrfs/volumes.c | 9 +++------ 13 files changed, 32 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 08b14449fabe..3864aec520b3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1057,8 +1057,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - ret = btrfs_del_item(trans, root, path); - return ret; + return btrfs_del_item(trans, root, path); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e245b8c4c340..7267b2502665 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4002,8 +4002,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(trans, path, new_key, split_offset); - return ret; + return split_item(trans, path, new_key, split_offset); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4b7d9015e0da..2286bee2c6d3 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1137,8 +1137,8 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_record_root_in_trans(trans, node->root); if (ret) return ret; - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - return ret; + + return btrfs_update_delayed_inode(trans, node->root, path, node); } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 12d91407bb60..5d8dcaaf11fe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -175,7 +175,6 @@ static int 
btrfs_repair_eb_io_failure(const struct extent_buffer *eb, const u32 step = min(fs_info->nodesize, PAGE_SIZE); const u32 nr_steps = eb->len / step; phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; - int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; @@ -197,9 +196,8 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } - ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, - paddrs, step, mirror_num); - return ret; + return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, + eb->start, paddrs, step, mirror_num); } /* @@ -2145,11 +2143,10 @@ static int load_global_roots(struct btrfs_root *tree_root) return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) return ret; - ret = load_global_roots_objectid(tree_root, path, - BTRFS_FREE_SPACE_TREE_OBJECTID, - "free space"); - return ret; + return load_global_roots_objectid(tree_root, path, + BTRFS_FREE_SPACE_TREE_OBJECTID, + "free space"); } static int btrfs_read_roots(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index bb2ca1c9c7b0..d0dd50f7d279 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -187,8 +187,6 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, int set) { - int ret; - if (!changeset) return 0; if (set && (state->state & bits) == bits) @@ -196,9 +194,8 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, if (!set && (state->state & bits) == 0) return 0; changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; + + return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); } static inline struct extent_state *next_state(struct extent_state *state) diff --git a/fs/btrfs/extent-tree.c 
b/fs/btrfs/extent-tree.c index 04a266bb189b..3b840a4fdf1c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2559,7 +2559,6 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; - u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -2568,8 +2567,7 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - ret = btrfs_get_alloc_profile(fs_info, flags); - return ret; + return btrfs_get_alloc_profile(fs_info, flags); } static u64 first_logical_byte(struct btrfs_fs_info *fs_info) @@ -4191,10 +4189,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, else trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - return ret; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 87425a243040..69edf5f44bda 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1272,8 +1272,7 @@ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); - ret = extents_locked; - return ret; + return extents_locked; } copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 1ad2ad384b9e..a66ce9ef3aff 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1525,9 +1525,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - ret = 0; - - return ret; + return 0; } static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 
b73e1dd97208..a864f8c99729 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -371,14 +371,13 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 objectid) { struct btrfs_key key; - int ret; + key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(struct btrfs_inode_item)); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); } int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 912343fc9a73..cd3baeadda5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2372,7 +2372,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); - int ret; /* * The range must cover part of the @locked_folio, or a return of 1 @@ -2381,10 +2380,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_next_pos(locked_folio))); - if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_folio, start, end); - return ret; - } + if (should_nocow(inode, start, end)) + return run_delalloc_nocow(inode, locked_folio, start, end); if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && @@ -2392,11 +2389,9 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, - true); + return run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else - ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); - return ret; + return cow_file_range(inode, locked_folio, start, end, NULL, 0); } void btrfs_split_delalloc_extent(struct btrfs_inode 
*inode, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ae4a1b76646c..14d393a5853d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -694,7 +694,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, u64 dst) { - int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; @@ -707,8 +706,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.type = BTRFS_QGROUP_RELATION_KEY; key.offset = dst; - ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - return ret; + return btrfs_insert_empty_item(trans, quota_root, path, &key, 0); } static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, @@ -833,9 +831,7 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) if (ret > 0) return -ENOENT; - ret = btrfs_del_item(trans, quota_root, path); - - return ret; + return btrfs_del_item(trans, quota_root, path); } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, @@ -2655,10 +2651,8 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return ret; } - if (root_level == 0) { - ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); - return ret; - } + if (root_level == 0) + return btrfs_qgroup_trace_leaf_items(trans, root_eb); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5bfefc3e9c06..310b7d817a27 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3254,7 +3254,6 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, struct btrfs_key key; bool found = false; int i; - int ret; if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) return 0; @@ -3278,8 +3277,8 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(block_group, NULL, 
space_cache_ino); - return ret; + + return delete_block_group_cache(block_group, NULL, space_cache_ino); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8a08412f3529..9e52a6f8f7af 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2316,9 +2316,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } - ret = btrfs_commit_transaction(trans); - - return ret; + return btrfs_commit_transaction(trans); error_undo: if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -7167,7 +7165,6 @@ static int read_one_dev(struct extent_buffer *leaf, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; - int ret; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; @@ -7267,8 +7264,8 @@ static int read_one_dev(struct extent_buffer *leaf, atomic64_add(device->total_bytes - device->bytes_used, &fs_info->free_chunk_space); } - ret = 0; - return ret; + + return 0; } int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) From a464ed9834d76e0709c55b2c763063751d0f35df Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 10 Dec 2025 06:39:32 +0100 Subject: [PATCH 015/137] btrfs: rename btrfs_create_block_group_cache to btrfs_create_block_group struct btrfs_block_group used to be called struct btrfs_block_group_cache but got renamed to btrfs_block_group with commit 32da5386d9a4 ("btrfs: rename btrfs_block_group_cache"). Rename btrfs_create_block_group_cache() to btrfs_create_block_group() to reflect that change. 
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3864aec520b3..e417aba4c4c7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2265,7 +2265,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return 0; } -static struct btrfs_block_group *btrfs_create_block_group_cache( +static struct btrfs_block_group *btrfs_create_block_group( struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; @@ -2369,7 +2369,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); - cache = btrfs_create_block_group_cache(info, key->objectid); + cache = btrfs_create_block_group(info, key->objectid); if (!cache) return -ENOMEM; @@ -2490,7 +2490,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) struct btrfs_block_group *bg; map = rb_entry(node, struct btrfs_chunk_map, rb_node); - bg = btrfs_create_block_group_cache(fs_info, map->start); + bg = btrfs_create_block_group(fs_info, map->start); if (!bg) { ret = -ENOMEM; break; @@ -2885,7 +2885,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran btrfs_set_log_full_commit(trans); - cache = btrfs_create_block_group_cache(fs_info, chunk_offset); + cache = btrfs_create_block_group(fs_info, chunk_offset); if (!cache) return ERR_PTR(-ENOMEM); From c53d61e24086fc7893fa03b148f930c08bd3cd77 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 9 Dec 2025 18:10:30 +0100 Subject: [PATCH 016/137] btrfs: simplify internal btrfs_printk helpers The printk() can be compiled out depending on CONFIG_PRINTK, this is reflected in our helpers. The indirection is provided by btrfs_printk() used in the ratelimited and RCU wrapper macros. 
Drop the btrfs_printk() helper and define the ratelimit and RCU helpers directly when CONFIG_PRINTK is undefined. This will allow further changes to the _btrfs_printk() interface (which is internal), any message in other code should use the level-specific helpers. Signed-off-by: David Sterba --- fs/btrfs/messages.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index d8c0bd17dcda..7049976342a5 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -23,9 +23,6 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_PRINTK -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - __printf(2, 3) __cold void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); @@ -34,6 +31,13 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) + #endif /* @@ -78,10 +82,12 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #endif +#ifdef CONFIG_PRINTK + #define btrfs_printk_in_rcu(fs_info, fmt, args...) 
\ do { \ rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ + _btrfs_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) @@ -93,10 +99,12 @@ do { \ \ rcu_read_lock(); \ if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ + _btrfs_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) +#endif + #ifdef CONFIG_BTRFS_ASSERT __printf(1, 2) From 01c8634c695d199de4a4669c5d684bbec148e71b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 9 Dec 2025 18:10:31 +0100 Subject: [PATCH 017/137] btrfs: pass level to _btrfs_printk() to avoid parsing level from string There's code in _btrfs_printk() to parse the message level from the input string so we can augment the message with the level description for better visibility in the logs. The parsing code has evolved over time, see commits: - 40f7828b36e3b9 ("btrfs: better handle btrfs_printk() defaults") - 262c5e86fec7cf ("printk/btrfs: handle more message headers") - 533574c6bc30cf ("btrfs: use printk_get_level and printk_skip_level, add __printf, fix fallout") - 4da35113426d16 ("btrfs: add varargs to btrfs_error") As we are using the specific level helpers everywhere we can simply pass the message level so we don't have to parse it. The proper printk() message header is created as KERN_SOH + "level". Signed-off-by: David Sterba --- fs/btrfs/messages.c | 26 ++++++------------------ fs/btrfs/messages.h | 49 ++++++++++++++++++++++----------------------- 2 files changed, 30 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 2f853de44473..6190777924bf 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -211,33 +211,19 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
+__printf(3, 4) __cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...) { - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; + const char *type = logtypes[level]; + struct ratelimit_state *ratelimit = &printk_limits[level]; #ifdef CONFIG_PRINTK_INDEX printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); #endif va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - vaf.fmt = fmt; vaf.va = &args; @@ -247,10 +233,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, char statestr[STATE_STRING_BUF_LEN]; btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + _printk(KERN_SOH "%dBTRFS %s (device %s%s): %pV\n", level, type, fs_info->sb->s_id, statestr, &vaf); } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + _printk(KERN_SOH "%dBTRFS %s: %pV\n", level, type, &vaf); } } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 7049976342a5..d4e4cad06092 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -23,19 +23,15 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_PRINTK -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +__printf(3, 4) __cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...); #else -#define btrfs_printk(fs_info, fmt, args...) 
\ -	btrfs_no_printk(fs_info, fmt, ##args) - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +#define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -44,38 +43,38 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); * Print a message with filesystem info, enclosed in RCU protection. */ #define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) #define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) #define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) /* * Wrappers that use a ratelimited printk */ #define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) 
\ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) + fs_info, LOGLEVEL_DEBUG, fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) + fs_info, LOGLEVEL_DEBUG, fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) #else /* When printk() is no_printk(), expand to no-op. */ #define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) @@ -84,14 +83,14 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #ifdef CONFIG_PRINTK -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - _btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ +#define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ +do { \ + rcu_read_lock(); \ + _btrfs_printk(fs_info, level, fmt, ##args); \ + rcu_read_unlock(); \ } while (0) -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +#define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ @@ -99,7 +98,7 @@ do { \ \ rcu_read_lock(); \ if (__ratelimit(&_rs)) \ - _btrfs_printk(fs_info, fmt, ##args); \ + _btrfs_printk(fs_info, level, fmt, ##args); \ rcu_read_unlock(); \ } while (0) From d6ee3fa8b31c42bb808c92177a24d051a509eb42 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 9 Dec 2025 18:10:32 +0100 Subject: [PATCH 018/137] btrfs: remove ASSERT compatibility for gcc < 8.x The minimum gcc version is 8 since 118c40b7b50340 ("kbuild: require gcc-8 and binutils-2.30"), the workaround for missing __VA_OPT__ support is not needed. 
Signed-off-by: David Sterba --- fs/btrfs/messages.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index d4e4cad06092..943e53980945 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -120,7 +120,6 @@ static inline void verify_assert_printk_format(const char *fmt, ...) { */ #define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ -#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 /* * Assertion with optional printk() format. * @@ -158,22 +157,6 @@ do { \ } \ } while(0) -#else - -/* For GCC < 8.x only the simple output. */ - -#define ASSERT(cond, args...) \ -do { \ - verify_assert_printk_format("check the format string" args); \ - if (!likely(cond)) { \ - pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ - #cond, (long)(cond), __FILE__, __LINE__); \ - BUG(); \ - } \ -} while(0) - -#endif - #else /* Compile check the @cond expression but don't generate any code. */ #define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond) From 3dcdcb7177632559e9392e755636a60e33e867b0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 9 Dec 2025 07:55:03 +1030 Subject: [PATCH 019/137] btrfs: shrink the size of btrfs_bio This is done by: - Shrink the size of btrfs_bio::mirror_num From 32 bits unsigned int to u16. Normally btrfs mirror number is either 0 (all profiles), 1 (all profiles), 2 (DUP/RAID1/RAID10/RAID5), 3 (RAID1C3) or 4 (RAID1C4). But for RAID6 the mirror number can go as large as the number of devices of that chunk. Currently the limit for number of devices for a data chunk is BTRFS_MAX_DEVS(), which is around 500 for the default 16K nodesize. And if going the max 64K nodesize, we can have a little over 2000 devices for a chunk. Although I'd argue it's way overkilled, we don't reject such cases yet thus u8 is not going to cut it, and have to use u16 (max out at 64K). - Use bit fields for boolean members Although it's not always safe for racy call sites, those members are safe. 
* csum_search_commit_root * is_scrub Those two are set immediately after bbio allocation and no more writes after allocation, thus they are very safe. * async_csum * can_use_append Those two are set for each split range, and after that there is no writes into those two members in different threads, thus they are also safe. And there are spaces for 4 more bits before increasing the size of btrfs_bio again, which should be future proof enough. - Reorder the structure members Now we always put the largest member first (after the huge 120 bytes union), making it easier to fill any holes. This reduce the size of btrfs_bio by 8 bytes, from 312 bytes to 304 bytes. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 246c7519dff3..157cdfa2f78a 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -68,32 +68,33 @@ struct btrfs_bio { struct btrfs_tree_parent_check parent_check; }; + /* For internal use in read end I/O handling */ + struct work_struct end_io_work; + /* End I/O information supplied to btrfs_bio_alloc */ btrfs_bio_end_io_t end_io; void *private; - /* For internal use in read end I/O handling */ - unsigned int mirror_num; atomic_t pending_ios; - struct work_struct end_io_work; + u16 mirror_num; /* Save the first error status of split bio. */ blk_status_t status; /* Use the commit root to look up csums (data read bio only). */ - bool csum_search_commit_root; + bool csum_search_commit_root:1; /* * Since scrub will reuse btree inode, we need this flag to distinguish * scrub bios. */ - bool is_scrub; + bool is_scrub:1; /* Whether the csum generation for data write is async. */ - bool async_csum; + bool async_csum:1; /* Whether the bio is written using zone append. 
*/ - bool can_use_append; + bool can_use_append:1; /* * This member must come last, bio_alloc_bioset will allocate enough From 4273db18a84ea3042b67864939e69b79eca50235 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 15 Dec 2025 11:38:18 +0100 Subject: [PATCH 020/137] btrfs: zoned: re-flow prepare_allocation_zoned() Re-flow prepare allocation zoned to make it a bit more readable by returning early and removing unnecessary indentations. This patch does not change any functionality. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b840a4fdf1c..1dcd69fe97ed 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4284,37 +4284,44 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info) { + struct btrfs_block_group *block_group; + if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); - } else if (ffe_ctl->for_data_reloc) { + return 0; + } + + if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); - } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { - struct btrfs_block_group *block_group; - - spin_lock(&fs_info->zone_active_bgs_lock); - list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - /* - * No lock is OK here because avail is monotonically - * decreasing, and this is just a hint. 
- */ - u64 avail = block_group->zone_capacity - block_group->alloc_offset; - - if (block_group_bits(block_group, ffe_ctl->flags) && - block_group->space_info == space_info && - avail >= ffe_ctl->num_bytes) { - ffe_ctl->hint_byte = block_group->start; - break; - } - } - spin_unlock(&fs_info->zone_active_bgs_lock); + return 0; } + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotonically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + return 0; } From 3d47c0c8b57073b1389dd9c53291d81eb8951e18 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Dec 2025 18:36:32 +0000 Subject: [PATCH 021/137] btrfs: remove duplicated root key setup in btrfs_create_tree() There's no need for an on stack key to define the root's key as we have already defined the key in the root itself. So remove the stack variable and use the key in the root. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5d8dcaaf11fe..7dea5615bd8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -802,7 +802,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; - struct btrfs_key key; unsigned int nofs_flag; int ret = 0; @@ -851,10 +850,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); - key.objectid = objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item); if (ret) goto fail; From 6d0f25cdd8e3248ee6e4899722d610083fe5aa6d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 12 Dec 2025 17:22:36 +0000 Subject: [PATCH 022/137] btrfs: update stale comment in __cow_file_range_inline() We mention that the reserved data space is page size aligned but that's not true anymore, as it's sector size aligned instead. In commit 0bb067ca64e3 ("btrfs: fix the qgroup data free range for inline data extents") we updated the amount passed to btrfs_qgroup_free_data() from page size to sector size, but forgot to update the comment. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd3baeadda5c..e83a881fe202 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -690,8 +690,8 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. - * And at reserve time, it's always aligned to page size, so - * just free one page here. + * And at reserve time, it's always aligned to sector size, so + * just free one sector here. * * If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need * to keep the data reservation. From cb73493cae906a7d6668b0e8077eb3a4ef0b2926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 15:51:47 +0000 Subject: [PATCH 023/137] btrfs: avoid transaction commit on error in del_balance_item() There's no point in committing the transaction if we failed to delete the item, since we haven't done anything before. Also stop using two variables for tracking the return value and use only 'ret'. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9e52a6f8f7af..a89243a57fde 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3691,7 +3691,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_key key; - int ret, err; + int ret; path = btrfs_alloc_path(); if (!path) @@ -3718,9 +3718,11 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) ret = btrfs_del_item(trans, root, path); out: btrfs_free_path(path); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret == 0) + ret = btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + return ret; } From 8670a25ecb2fbc35d4e58f8f522e7d5b735d6778 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 11:56:04 +0000 Subject: [PATCH 024/137] btrfs: use single return variable in btrfs_find_orphan_roots() We use both 'ret' and 'err' which is a pattern that generates confusion and resulted in subtle bugs in the past. Remove 'err' and use only 'ret'. Also move simplify the error flow by directly returning from the function instead of breaking of the loop, since there are no resources to cleanup after the loop. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6a7e297ab0a7..a7171112d638 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -217,8 +217,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_root *root; - int err = 0; - int ret; path = btrfs_alloc_path(); if (!path) @@ -230,20 +228,19 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) while (1) { u64 root_objectid; + int ret; ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } + if (ret < 0) + return ret; leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(tree_root, path); if (ret < 0) - err = ret; - if (ret != 0) - break; + return ret; + else if (ret > 0) + return 0; leaf = path->nodes[0]; } @@ -252,34 +249,33 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) if (key.objectid != BTRFS_ORPHAN_OBJECTID || key.type != BTRFS_ORPHAN_ITEM_KEY) - break; + return 0; root_objectid = key.offset; key.offset++; root = btrfs_get_fs_root(fs_info, root_objectid, false); - err = PTR_ERR_OR_ZERO(root); - if (err && err != -ENOENT) { - break; - } else if (err == -ENOENT) { + ret = PTR_ERR_OR_ZERO(root); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { struct btrfs_trans_handle *trans; btrfs_release_path(path); trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { - err = PTR_ERR(trans); - btrfs_handle_fs_error(fs_info, err, + ret = PTR_ERR(trans); + btrfs_handle_fs_error(fs_info, ret, "Failed to start trans to delete orphan item"); - break; + return ret; } - err = btrfs_del_orphan_item(trans, tree_root, - root_objectid); + ret = 
btrfs_del_orphan_item(trans, tree_root, root_objectid); btrfs_end_transaction(trans); - if (err) { - btrfs_handle_fs_error(fs_info, err, + if (ret) { + btrfs_handle_fs_error(fs_info, ret, "Failed to delete root orphan item"); - break; + return ret; } continue; } @@ -307,7 +303,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) btrfs_put_root(root); } - return err; + return 0; } /* drop the root item for 'key' from the tree root */ From 19231903711e412dd2fe383dac0e48e5919bba4b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 12:02:18 +0000 Subject: [PATCH 025/137] btrfs: remove redundant path release in btrfs_find_orphan_roots() There's no need to release the path in the if branch used when the root does not exists since we released the path before the call to btrfs_get_fs_root(). So remove that redundant btrfs_release_path() call. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a7171112d638..40f9bc9485e8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -261,8 +261,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) } else if (ret == -ENOENT) { struct btrfs_trans_handle *trans; - btrfs_release_path(path); - trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); From 8bc612906f2f77279c159242d0413a8a44aec126 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 12:31:11 +0000 Subject: [PATCH 026/137] btrfs: don't call btrfs_handle_fs_error() after failure to join transaction In btrfs_find_orphan_roots() we don't need to call btrfs_handle_fs_error() if we fail to join a transaction. This is because we haven't done anything yet regarding the current root and previous iterations of the loop dealt with other roots, so there's nothing we need to undo. 
Instead log an error message and return the error to the caller, which will result either in a mount failure or remount failure (the only contexts it's called from). Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 40f9bc9485e8..a6880ac5f060 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -264,8 +264,9 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_handle_fs_error(fs_info, ret, - "Failed to start trans to delete orphan item"); + btrfs_err(fs_info, + "failed to join transaction to delete orphan item: %d", + ret); return ret; } ret = btrfs_del_orphan_item(trans, tree_root, root_objectid); From c9b640cefac04112f63637b13d7ce9fbeecd6fb0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 12:41:16 +0000 Subject: [PATCH 027/137] btrfs: don't call btrfs_handle_fs_error() after failure to delete orphan item In btrfs_find_orphan_roots() we don't need to call btrfs_handle_fs_error() if we fail to delete the orphan item for the current root. This is because we haven't done anything yet regarding the current root and previous iterations of the loop dealt with other roots, so there's nothing we need to undo. Instead log an error message and return the error to the caller, which will result either in a mount failure or remount failure (the only contexts it's called from). 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/root-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a6880ac5f060..37a4173c0a0b 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,8 +272,8 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) ret = btrfs_del_orphan_item(trans, tree_root, root_objectid); btrfs_end_transaction(trans); if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to delete root orphan item"); + btrfs_err(fs_info, + "failed to delete root orphan item: %d", ret); return ret; } continue; From d15a190d9efd59398ff08133238268784090066d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 16:01:01 +0000 Subject: [PATCH 028/137] btrfs: don't call btrfs_handle_fs_error() in qgroup_account_snapshot() There's no need to call btrfs_handle_fs_error() as we are inside a transaction and we propagate the error returned from btrfs_write_and_wait_transaction() to the caller and it ends going up the call chain to btrfs_commit_transaction() (returned by the call to create_pending_snapshots()), where we jump to the 'unlock_reloc' label and end up calling cleanup_transaction(), which aborts the transaction. This is odd given that we have a transaction handle and that in the transaction commit path any error makes us abort the transaction and, besides another place inside btrfs_commit_transaction(), it's the only place that calls btrfs_handle_fs_error(). Remove the btrfs_handle_fs_error() call and replace it with an error message so that if it happens we know what went wrong during the transaction commit. Also annotate the condition in the if statement with 'unlikely' since this is not expected to happen. We've been wanting to remove btrfs_handle_fs_error(), so this removes one user that does not even need it. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bd03f465e2d3..a2a1c0aaeb75 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1621,9 +1621,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); - if (ret) - btrfs_handle_fs_error(fs_info, ret, - "Error while writing out transaction for qgroup"); + if (unlikely(ret)) + btrfs_err(fs_info, +"error while writing out transaction during qgroup snapshot accounting: %d", ret); out: /* From 68d4ece9c30e966ebb92798ea54a7777daff384b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 16 Dec 2025 16:27:57 +0000 Subject: [PATCH 029/137] btrfs: don't call btrfs_handle_fs_error() in btrfs_commit_transaction() There's no need to call btrfs_handle_fs_error() as we are inside a transaction and if we get an error we jump to the 'scrub_continue' label and end up calling cleanup_transaction(), which aborts the transaction. This is odd given that we have a transaction handle and that in the transaction commit path any error makes us abort the transaction and it's the only place that calls btrfs_handle_fs_error(). Remove the btrfs_handle_fs_error() call and replace it with an error message so that if it happens we know what went wrong during the transaction commit. Also annotate the condition in the if statement with 'unlikely' since this is not expected to happen. We've been wanting to remove btrfs_handle_fs_error(), so this removes one user that does not even need it. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a2a1c0aaeb75..d29d32dff6e3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2550,9 +2550,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up_process(fs_info->cleaner_kthread); ret = btrfs_write_and_wait_transaction(trans); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Error while writing out transaction"); + if (unlikely(ret)) { + btrfs_err(fs_info, "error while writing out transaction: %d", ret); mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } From 6a5ac228d4ad05c250de0bf06713d81bfb70c714 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Dec 2025 14:41:37 +0100 Subject: [PATCH 030/137] btrfs: zoned: show statistics about zoned filesystems in mountstats Add statistics output to /proc//mountstats for zoned BTRFS, similar to the zoned statistics from XFS in mountstats. 
The output for /proc//mountstats on an example filesystem will be as follows: device /dev/vda mounted on /mnt with fstype btrfs zoned statistics: active block-groups: 7 reclaimable: 0 unused: 5 need reclaim: false data relocation block-group: 1342177280 active zones: start: 1073741824, wp: 268419072 used: 0, reserved: 268419072, unusable: 0 start: 1342177280, wp: 0 used: 0, reserved: 0, unusable: 0 start: 1610612736, wp: 49152 used: 16384, reserved: 16384, unusable: 16384 start: 1879048192, wp: 950272 used: 131072, reserved: 622592, unusable: 196608 start: 2147483648, wp: 212238336 used: 0, reserved: 212238336, unusable: 0 start: 2415919104, wp: 0 used: 0, reserved: 0, unusable: 0 start: 2684354560, wp: 0 used: 0, reserved: 0, unusable: 0 Reviewed-by: Naohiro Aota Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 13 ++++++++++++ fs/btrfs/zoned.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 8 +++++++ 3 files changed, 75 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a931555e6dc..d64d303b6edc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2483,6 +2483,18 @@ static void btrfs_shutdown(struct super_block *sb) } #endif +static int btrfs_show_stats(struct seq_file *seq, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + + if (btrfs_is_zoned(fs_info)) { + btrfs_show_zoned_stats(fs_info, seq); + return 0; + } + + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2498,6 +2510,7 @@ static const struct super_operations btrfs_super_ops = { .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, + .show_stats = btrfs_show_stats, #ifdef CONFIG_BTRFS_EXPERIMENTAL .remove_bdev = btrfs_remove_bdev, .shutdown = btrfs_shutdown, diff --git 
a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 359a98e6de85..a58a4336a5b7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2984,3 +2984,57 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num return 0; } + +void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) +{ + struct btrfs_block_group *bg; + u64 data_reloc_bg; + u64 treelog_bg; + + seq_puts(seq, "\n zoned statistics:\n"); + + spin_lock(&fs_info->zone_active_bgs_lock); + seq_printf(seq, "\tactive block-groups: %zu\n", + list_count_nodes(&fs_info->zone_active_bgs)); + spin_unlock(&fs_info->zone_active_bgs_lock); + + spin_lock(&fs_info->unused_bgs_lock); + seq_printf(seq, "\t reclaimable: %zu\n", + list_count_nodes(&fs_info->reclaim_bgs)); + seq_printf(seq, "\t unused: %zu\n", list_count_nodes(&fs_info->unused_bgs)); + spin_unlock(&fs_info->unused_bgs_lock); + + seq_printf(seq,"\t need reclaim: %s\n", + str_true_false(btrfs_zoned_should_reclaim(fs_info))); + + data_reloc_bg = data_race(fs_info->data_reloc_bg); + if (data_reloc_bg) + seq_printf(seq, "\tdata relocation block-group: %llu\n", + data_reloc_bg); + treelog_bg = data_race(fs_info->treelog_bg); + if (treelog_bg) + seq_printf(seq, "\ttree-log block-group: %llu\n", treelog_bg); + + spin_lock(&fs_info->zone_active_bgs_lock); + seq_puts(seq, "\tactive zones:\n"); + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { + u64 start; + u64 alloc_offset; + u64 used; + u64 reserved; + u64 zone_unusable; + + spin_lock(&bg->lock); + start = bg->start; + alloc_offset = bg->alloc_offset; + used = bg->used; + reserved = bg->reserved; + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); + + seq_printf(seq, + "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n", + start, alloc_offset, used, reserved, zone_unusable); + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5cefdeb08b7b..2fdc88c6fa3c 100644 --- 
a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "messages.h" #include "volumes.h" #include "disk-io.h" @@ -96,6 +97,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); +void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq); + #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -275,6 +278,11 @@ static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space return 0; } +static inline int btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 2ef2e97fe74e937a246681713434ca16d8e5552e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Dec 2025 14:41:38 +0100 Subject: [PATCH 031/137] btrfs: move space_info_flag_to_str() to space-info.h Move space_info_flag_to_str() to space-info.h and as it now isn't static to space-info.c any more prefix it with 'btrfs_'. This way it can be re-used in other places. 
Reviewed-by: Filipe Manana Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 18 +----------------- fs/btrfs/space-info.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3f08e450f796..857e4fd2c77e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -606,22 +606,6 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) -static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) -{ - switch (space_info->flags) { - case BTRFS_BLOCK_GROUP_SYSTEM: - return "SYSTEM"; - case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: - return "DATA+METADATA"; - case BTRFS_BLOCK_GROUP_DATA: - return "DATA"; - case BTRFS_BLOCK_GROUP_METADATA: - return "METADATA"; - default: - return "UNKNOWN"; - } -} - static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) { DUMP_BLOCK_RSV(fs_info, global_block_rsv); @@ -634,7 +618,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) static void __btrfs_dump_space_info(const struct btrfs_space_info *info) { const struct btrfs_fs_info *fs_info = info->fs_info; - const char *flag_str = space_info_flag_to_str(info); + const char *flag_str = btrfs_space_info_type_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 446c0614ad4a..0703f24b23f7 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -307,4 +307,20 @@ int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); +static inline const char *btrfs_space_info_type_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + 
return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + #endif /* BTRFS_SPACE_INFO_H */ From 9da49784ae9fff59b6056d8a4018d95a16f9915b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 17 Dec 2025 14:41:39 +0100 Subject: [PATCH 032/137] btrfs: zoned: print block-group type for zoned statistics When printing the zoned statistics, also include the block-group type in the block-group listing output. The updated output looks as follows: device /dev/vda mounted on /mnt with fstype btrfs zoned statistics: active block-groups: 9 reclaimable: 0 unused: 2 need reclaim: false data relocation block-group: 3221225472 active zones: start: 1073741824, wp: 268419072 used: 268419072, reserved: 0, unusable: 0 (DATA) start: 1342177280, wp: 0 used: 0, reserved: 0, unusable: 0 (DATA) start: 1610612736, wp: 81920 used: 16384, reserved: 16384, unusable: 49152 (SYSTEM) start: 1879048192, wp: 2031616 used: 1458176, reserved: 65536, unusable: 507904 (METADATA) start: 2147483648, wp: 268419072 used: 268419072, reserved: 0, unusable: 0 (DATA) start: 2415919104, wp: 268419072 used: 268419072, reserved: 0, unusable: 0 (DATA) start: 2684354560, wp: 268419072 used: 268419072, reserved: 0, unusable: 0 (DATA) start: 2952790016, wp: 65536 used: 65536, reserved: 0, unusable: 0 (DATA) start: 3221225472, wp: 0 used: 0, reserved: 0, unusable: 0 (DATA) Reviewed-by: Filipe Manana Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a58a4336a5b7..2e861eef5cd8 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -3023,6 +3023,7 @@ void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) u64 
used; u64 reserved; u64 zone_unusable; + const char *typestr = btrfs_space_info_type_str(bg->space_info); spin_lock(&bg->lock); start = bg->start; @@ -3033,8 +3034,8 @@ void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) spin_unlock(&bg->lock); seq_printf(seq, - "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n", - start, alloc_offset, used, reserved, zone_unusable); + "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu (%s)\n", + start, alloc_offset, used, reserved, zone_unusable, typestr); } spin_unlock(&fs_info->zone_active_bgs_lock); } From c28214bde6da6e05554a0e5b6375b7b65f98cdbf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Dec 2025 19:18:43 +1030 Subject: [PATCH 033/137] btrfs: refactor the main loop of cow_file_range() Currently inside the main loop of cow_file_range(), we do the following sequence: - Reserve an extent - Lock the IO tree range - Create an IO extent map - Create an ordered extent Every step will need extra steps to do cleanup in the following order: - Drop the newly created extent map - Unlock extent range and cleanup the involved folios - Free the reserved extent However currently the error handling is done inconsistently: - Extent map drop is handled in a dedicated tag Out of the main loop, make it much harder to track. - The extent unlock and folios cleanup is done separately The extent is unlocked through btrfs_unlock_extent(), then extent_clear_unlock_delalloc() again in a dedicated tag. Meanwhile all other callsites (compression/encoded/nocow) all just call extent_clear_unlock_delalloc() to handle unlock and folio clean up in one go. - Reserved extent freeing is handled in a dedicated tag Out of the main loop, make it much harder to track. - Error handling of btrfs_reloc_clone_csums() is relying out-of-loop tags This is due to the special requirement to finish ordered extents to handle the metadata reserved space. 
Enhance the error handling and align the behavior by: - Introduce a dedicated cow_one_range() helper which does the reserve/lock/allocation in the helper. And also handle the errors inside the helper. No more dedicated tags out of the main loop. - Use a single extent_clear_unlock_delalloc() to unlock and cleanup folios - Move the btrfs_reloc_clone_csums() error handling into the new helper Thankfully it's not that complex compared to other cases. And since we're here, also reduce the width of the following local variables to u32: - cur_alloc_size - min_alloc_size Each allocation won't go beyond 128M, thus u32 is more than enough. - blocksize The maximum is 64K, no need for u64. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 245 +++++++++++++++++++++++++++-------------------- 1 file changed, 142 insertions(+), 103 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e83a881fe202..b95dab8ac8a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1274,6 +1274,133 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, return alloc_hint; } +/* + * Handle COW for one range. + * + * @ins: The key representing the allocated range. + * @file_offset: The file offset of the COW range + * @num_bytes: The expected length of the COW range + * The actually allocated length can be smaller than it. + * @min_alloc_size: The minimal extent size. + * @alloc_hint: The hint for the extent allocator. + * @ret_alloc_size: The COW range handled by this function. + * + * Return 0 if everything is fine and @ret_alloc_size is updated. The + * range is still locked, and caller should unlock the range after everything + * is done or for error handling. + * + * Return <0 for error and @ret_alloc_size is updated for where the extra cleanup should + * happen. The range [file_offset, file_offset + ret_alloc_size) will be + * cleaned up by this function. 
+ */ +static int cow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct btrfs_key *ins, struct extent_state **cached, + u64 file_offset, u32 num_bytes, u32 min_alloc_size, + u64 alloc_hint, u32 *ret_alloc_size) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; + struct extent_map *em; + u32 cur_len = 0; + u64 cur_end; + int ret; + + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, + 0, alloc_hint, ins, true, true); + if (ret < 0) { + *ret_alloc_size = cur_len; + return ret; + } + + cur_len = ins->offset; + cur_end = file_offset + cur_len - 1; + + file_extent.disk_bytenr = ins->objectid; + file_extent.disk_num_bytes = ins->offset; + file_extent.num_bytes = ins->offset; + file_extent.ram_bytes = ins->offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; + + /* + * Locked range will be released either during error clean up (inside + * this function or by the caller for previously successful ranges) or + * after the whole range is finished. + */ + btrfs_lock_extent(&inode->io_tree, file_offset, cur_end, cached); + em = btrfs_create_io_em(inode, file_offset, &file_extent, BTRFS_ORDERED_REGULAR); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto free_reserved; + } + btrfs_free_extent_map(em); + + ordered = btrfs_alloc_ordered_extent(inode, file_offset, &file_extent, + 1U << BTRFS_ORDERED_REGULAR); + if (IS_ERR(ordered)) { + btrfs_drop_extent_map_range(inode, file_offset, cur_end, false); + ret = PTR_ERR(ordered); + goto free_reserved; + } + + if (btrfs_is_data_reloc_root(root)) { + ret = btrfs_reloc_clone_csums(ordered); + + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() at + * free_reserved label to free meta of this ordered extent, as + * its meta should be freed by btrfs_finish_ordered_io(). 
+ * + * So we must continue until @start is increased to + * skip current ordered extent. + */ + if (ret) + btrfs_drop_extent_map_range(inode, file_offset, + cur_end, false); + } + btrfs_put_ordered_extent(ordered); + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + /* + * Error handling for btrfs_reloc_clone_csums(). + * + * Treat the range as finished, thus only clear EXTENT_LOCKED | EXTENT_DELALLOC. + * The accounting will be done by ordered extents. + */ + if (unlikely(ret < 0)) { + btrfs_cleanup_ordered_extents(inode, file_offset, cur_len); + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); + } + *ret_alloc_size = cur_len; + return ret; + +free_reserved: + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL); + btrfs_dec_block_group_reservations(fs_info, ins->objectid); + btrfs_free_reserved_extent(fs_info, ins->objectid, ins->offset, true); + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); + *ret_alloc_size = cur_len; + /* + * We should not return -EAGAIN where it's a special return code for + * zoned to catch btrfs_reserved_extent(). + */ + ASSERT(ret != -EAGAIN); + return ret; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. 
The basic idea is to @@ -1310,11 +1437,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - u64 cur_alloc_size = 0; - u64 min_alloc_size; - u64 blocksize = fs_info->sectorsize; + u32 min_alloc_size; + u32 blocksize = fs_info->sectorsize; + u32 cur_alloc_size = 0; struct btrfs_key ins; - struct extent_map *em; unsigned clear_bits; unsigned long page_ops; int ret = 0; @@ -1383,16 +1509,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { - struct btrfs_ordered_extent *ordered; - struct btrfs_file_extent file_extent; + ret = cow_one_range(inode, locked_folio, &ins, &cached, start, + num_bytes, min_alloc_size, alloc_hint, &cur_alloc_size); - ret = btrfs_reserve_extent(root, num_bytes, num_bytes, - min_alloc_size, 0, alloc_hint, - &ins, true, true); if (ret == -EAGAIN) { /* - * btrfs_reserve_extent only returns -EAGAIN for zoned - * file systems, which is an indication that there are + * cow_one_range() only returns -EAGAIN for zoned + * file systems (from btrfs_reserve_extent()), which + * is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at @@ -1421,79 +1545,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } if (ret < 0) goto out_unlock; - cur_alloc_size = ins.offset; - file_extent.disk_bytenr = ins.objectid; - file_extent.disk_num_bytes = ins.offset; - file_extent.num_bytes = ins.offset; - file_extent.ram_bytes = ins.offset; - file_extent.offset = 0; - file_extent.compression = BTRFS_COMPRESS_NONE; + /* We should not allocate an extent larger than requested.*/ + ASSERT(cur_alloc_size <= num_bytes); - /* - * Locked range will be released either during error clean up or - * after the whole range is finished. 
- */ - btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); - - em = btrfs_create_io_em(inode, start, &file_extent, - BTRFS_ORDERED_REGULAR); - if (IS_ERR(em)) { - btrfs_unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); - ret = PTR_ERR(em); - goto out_reserve; - } - btrfs_free_extent_map(em); - - ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1U << BTRFS_ORDERED_REGULAR); - if (IS_ERR(ordered)) { - btrfs_unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); - ret = PTR_ERR(ordered); - goto out_drop_extent_cache; - } - - if (btrfs_is_data_reloc_root(root)) { - ret = btrfs_reloc_clone_csums(ordered); - - /* - * Only drop cache here, and process as normal. - * - * We must not allow extent_clear_unlock_delalloc() - * at out_unlock label to free meta of this ordered - * extent, as its meta should be freed by - * btrfs_finish_ordered_io(). - * - * So we must continue until @start is increased to - * skip current ordered extent. - */ - if (ret) - btrfs_drop_extent_map_range(inode, start, - start + cur_alloc_size - 1, - false); - } - btrfs_put_ordered_extent(ordered); - - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - - if (num_bytes < cur_alloc_size) - num_bytes = 0; - else - num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; cur_alloc_size = 0; - - /* - * btrfs_reloc_clone_csums() error, since start is increased - * extent_clear_unlock_delalloc() at out_unlock label won't - * free metadata of current ordered extent, we're OK to exit. 
- */ - if (ret) - goto out_unlock; } extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1502,11 +1561,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *done_offset = end; return ret; -out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); -out_reserve: - btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1543,24 +1597,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* - * For the range (2). If we reserved an extent for our delalloc range - * (or a subrange) and failed to create the respective ordered extent, - * then it means that when we reserved the extent we decremented the - * extent's size from the data space_info's bytes_may_use counter and - * incremented the space_info's bytes_reserved counter by the same - * amount. We must make sure extent_clear_unlock_delalloc() does not try - * to decrement again the data space_info's bytes_may_use counter, - * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. - */ - if (cur_alloc_size) { - extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size - 1, - locked_folio, &cached, clear_bits, - page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - } - - /* + * For the range (2) the error handling is done by cow_one_range() itself. + * Nothing needs to be done. + * * For the range (3). We never touched the region. 
In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in @@ -1575,7 +1614,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, end - start - cur_alloc_size + 1, NULL); } btrfs_err(fs_info, -"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", +"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%u: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), orig_start, end + 1 - orig_start, start, cur_alloc_size, ret); From 4cdb457a23751a1debebbf7d010300fe4eff47a8 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Thu, 18 Dec 2025 11:30:37 +0800 Subject: [PATCH 034/137] btrfs: remove unreachable return after btrfs_backref_panic() in btrfs_backref_finish_upper_links() The return statement after btrfs_backref_panic() is unreachable since btrfs_backref_panic() calls BUG() which never returns. Remove the return to unify it with the other calls to btrfs_backref_panic(). 
Signed-off-by: Zhen Ni Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78da47a3d00e..9bb406f7dd30 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3609,10 +3609,8 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, } rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); - if (unlikely(rb_node)) { + if (unlikely(rb_node)) btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); - return -EUCLEAN; - } list_add_tail(&edge->list[UPPER], &upper->lower); From 858f32937c8ad5519c5daa9d5498963fe0bd0139 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 Dec 2025 13:04:21 +0000 Subject: [PATCH 035/137] btrfs: tag as unlikely error conditions in the transaction commit path Errors are unexpected during the transaction commit path, and when they happen we abort the transaction (by calling cleanup_transaction() under the label 'cleanup_transaction' in btrfs_commit_transaction()). So mark every error check in the transaction commit path as unlikely, to hint the compiler so that it can possibly generate better code, and make it clear for a reader about being unexpected. On a x86_84 box using gcc 14.2.0-19 from Debian, this resulted in a slight reduction of the module's text size. 
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939476 172568 15592 2127636 207714 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939044 172568 15592 2127204 207564 fs/btrfs/btrfs.ko Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d29d32dff6e3..e2f993b1783f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1515,7 +1515,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); ret2 = btrfs_update_reloc_root(trans, root); - if (ret2) + if (unlikely(ret2)) return ret2; /* see comments in should_cow_block() */ @@ -1532,7 +1532,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); - if (ret2) + if (unlikely(ret2)) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); } @@ -1687,11 +1687,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); - if (pending->error) + if (unlikely(pending->error)) goto free_pending; pending->error = btrfs_get_free_objectid(tree_root, &objectid); - if (pending->error) + if (unlikely(pending->error)) goto free_fname; /* @@ -1707,7 +1707,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); - if (pending->error) + if (unlikely(pending->error)) goto clear_skip_qgroup; } @@ -1719,7 +1719,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->bytes_reserved, 1); parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); - if (ret) + if (unlikely(ret)) goto 
fail; cur_time = current_time(&parent_inode->vfs_inode); @@ -1736,7 +1736,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(parent_inode), &fname.disk_name, 0); - if (dir_item != NULL && !IS_ERR(dir_item)) { + if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) { pending->error = -EEXIST; goto dir_item_existed; } else if (IS_ERR(dir_item)) { @@ -1873,7 +1873,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, btrfs_root_id(parent_root), pending->inherit); - if (ret < 0) + if (unlikely(ret < 0)) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, @@ -1939,7 +1939,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); ret = create_pending_snapshot(trans, pending); - if (ret) + if (unlikely(ret)) break; } return ret; @@ -2258,7 +2258,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - if (ret) + if (unlikely(ret)) goto lockdep_trans_commit_start_release; } } @@ -2308,7 +2308,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); - if (ret) + if (unlikely(ret)) goto lockdep_release; spin_lock(&fs_info->trans_lock); } @@ -2338,11 +2338,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); - if (ret) + if (unlikely(ret)) goto lockdep_release; ret = btrfs_run_delayed_items(trans); - if (ret) + if (unlikely(ret)) goto lockdep_release; /* @@ -2357,7 +2357,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) /* some 
pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); - if (ret) { + if (unlikely(ret)) { btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; } @@ -2429,7 +2429,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2443,11 +2443,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, U64_MAX); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2459,7 +2459,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) WARN_ON(cur_trans != trans->transaction); ret = commit_fs_roots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* commit_fs_roots gets rid of all the tree log roots, it is now @@ -2472,11 +2472,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * new_roots. So let's do quota accounting. 
*/ ret = btrfs_qgroup_account_extents(trans); - if (ret < 0) + if (unlikely(ret < 0)) goto unlock_reloc; ret = commit_cowonly_roots(trans); - if (ret) + if (unlikely(ret)) goto unlock_reloc; /* @@ -2562,7 +2562,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); - if (ret) + if (unlikely(ret)) goto scrub_continue; update_commit_stats(fs_info); @@ -2575,7 +2575,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); ret = btrfs_finish_extent_commit(trans); - if (ret) + if (unlikely(ret)) goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) From 7d7608cc9ad8acc0d03dd85175558cb2cd634f86 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 Dec 2025 13:15:42 +0000 Subject: [PATCH 036/137] btrfs: move unlikely checks around btrfs_is_shutdown() into the helper Instead of surrounding every caller of btrfs_is_shutdown() with unlikely, move the unlikely into the helper itself, like we do in other places in btrfs and is common in the kernel outside btrfs too. Also make the fs_info argument of btrfs_is_shutdown() const. On a x86_84 box using gcc 14.2.0-19 from Debian, this resulted in a slight reduction of the module's text size. 
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939044 172568 15592 2127204 207564 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1938876 172568 15592 2127036 2074bc fs/btrfs/btrfs.ko Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 12 ++++++------ fs/btrfs/fs.h | 4 ++-- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 69edf5f44bda..5d47cff5af42 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1437,7 +1437,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) + if (btrfs_is_shutdown(inode->root->fs_info)) return -EIO; /* * If the fs flips readonly due to some impossible error, although we @@ -2042,7 +2042,7 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))) return -EIO; if (!mapping->a_ops->read_folio) return -ENOEXEC; @@ -3113,7 +3113,7 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + if (btrfs_is_shutdown(inode_to_fs_info(inode))) return -EIO; /* Do not allow fallocate in ZONED mode */ @@ -3807,7 +3807,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + if (btrfs_is_shutdown(inode_to_fs_info(inode))) return -EIO; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; @@ -3822,7 
+3822,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))) return -EIO; if (iocb->ki_flags & IOCB_DIRECT) { @@ -3839,7 +3839,7 @@ static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 458a24206935..428b2b239189 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -1154,9 +1154,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) -static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) +static inline bool btrfs_is_shutdown(const struct btrfs_fs_info *fs_info) { - return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); + return unlikely(test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)); } static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b95dab8ac8a1..247b373bf5cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -901,7 +901,7 @@ static void compress_file_range(struct btrfs_work *work) int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; - if (unlikely(btrfs_is_shutdown(fs_info))) + if (btrfs_is_shutdown(fs_info)) goto cleanup_and_bail_uncompressed; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -1445,7 +1445,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, unsigned long page_ops; int ret = 0; - if (unlikely(btrfs_is_shutdown(fs_info))) { + if 
(btrfs_is_shutdown(fs_info)) { ret = -EIO; goto out_unlock; } @@ -2111,7 +2111,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); - if (unlikely(btrfs_is_shutdown(fs_info))) { + if (btrfs_is_shutdown(fs_info)) { ret = -EIO; goto error; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index acb484546b1d..d9e7dd317670 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5000,7 +5000,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))) return -EIO; switch (cmd->cmd_op) { diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 58dc3e5057ce..ab4ce56d69ee 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -873,7 +873,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, bool same_inode = dst_inode == src_inode; int ret; - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))) return -EIO; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) From 8d206b0c21ef9b230627ff742170130912a1db3a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 Dec 2025 15:53:59 +0000 Subject: [PATCH 037/137] btrfs: avoid transaction commit on error in insert_balance_item() There's no point in committing the transaction if we failed to insert the balance item, since we haven't done anything else after we started/joined the transaction. Also stop using two variables for tracking the return value and use only 'ret'. 
Reviewed-by: Daniel Vacek Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a89243a57fde..a541cd30c6b8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3644,7 +3644,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; - int ret, err; + int ret; path = btrfs_alloc_path(); if (!path) @@ -3679,9 +3679,11 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_flags(leaf, item, bctl->flags); out: btrfs_free_path(path); - err = btrfs_commit_transaction(trans); - if (err && !ret) - ret = err; + if (ret == 0) + ret = btrfs_commit_transaction(trans); + else + btrfs_end_transaction(trans); + return ret; } From fdb945f6659374c9628509517901d49035b0e984 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 19 Dec 2025 15:36:49 +0800 Subject: [PATCH 038/137] btrfs: simplify check for zoned NODATASUM writes in btrfs_submit_chunk() This function already dereferences 'inode' multiple times earlier, making the additional NULL check at line 840 redundant since the function would have crashed already if inode were NULL. After commit 81cea6cd7041 ("btrfs: remove btrfs_bio::fs_info by extracting it from btrfs_bio::inode"), the btrfs_bio::inode field is mandatory for all btrfs_bio allocations and is guaranteed to be non-NULL. Simplify the condition for allocating dummy checksums for zoned NODATASUM data by removing the unnecessary 'inode &&' check. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Zhen Ni Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index e4d382d3a7ae..a12446aa0fbf 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -836,8 +836,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (status) goto fail; } else if (bbio->can_use_append || - (btrfs_is_zoned(fs_info) && inode && - inode->flags & BTRFS_INODE_NODATASUM)) { + (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); status = errno_to_blk_status(ret); if (status) From d1a020a8d72731b80a01e1abdb8ff965ee278f69 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 20 Dec 2025 13:07:40 +1030 Subject: [PATCH 039/137] btrfs: add mount time auto fix for orphan fst entries [BUG] Before btrfs-progs v6.16.1 release, mkfs.btrfs can leave free space tree entries for deleted chunks: # mkfs.btrfs -f -O fst $dev # btrfs ins dump-tree -t chunk $dev btrfs-progs v6.16 chunk tree leaf 22036480 items 4 free space 15781 generation 8 owner CHUNK_TREE leaf 22036480 flags 0x1(WRITTEN) backref revision 1 item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98 item 1 key (FIRST_CHUNK_TREE CHUNK_ITEM 13631488) itemoff 16105 itemsize 80 ^^^ The first chunk is at 13631488 item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 22020096) itemoff 15993 itemsize 112 item 3 key (FIRST_CHUNK_TREE CHUNK_ITEM 30408704) itemoff 15881 itemsize 112 # btrfs ins dump-tree -t free-space-tree $dev btrfs-progs v6.16 free space tree key (FREE_SPACE_TREE ROOT_ITEM 0) leaf 30556160 items 13 free space 15918 generation 8 owner FREE_SPACE_TREE leaf 30556160 flags 0x1(WRITTEN) backref revision 1 item 0 key (1048576 FREE_SPACE_INFO 4194304) itemoff 16275 itemsize 8 free space info extent count 1 flags 0 item 1 key (1048576 FREE_SPACE_EXTENT 4194304) itemoff 16275 itemsize 0 free space extent item 2 key 
(5242880 FREE_SPACE_INFO 8388608) itemoff 16267 itemsize 8 free space info extent count 1 flags 0 item 3 key (5242880 FREE_SPACE_EXTENT 8388608) itemoff 16267 itemsize 0 free space extent ^^^ Above 4 items are all before the first chunk. item 4 key (13631488 FREE_SPACE_INFO 8388608) itemoff 16259 itemsize 8 free space info extent count 1 flags 0 item 5 key (13631488 FREE_SPACE_EXTENT 8388608) itemoff 16259 itemsize 0 free space extent ... This can trigger btrfs check errors. [CAUSE] It's a bug in the free space tree implementation of btrfs-progs, which doesn't delete involved fst entries for the to-be-deleted chunk/block group. [ENHANCEMENT] The most common fix is to clear the space cache and rebuild it, but that requires a ro->rw remount which may not be possible for rootfs, and also relies on users to use "clear_cache" mount option manually. Here introduce a kernel fix for it, which will delete any entries that are before the first block group automatically at the first RW mount. For filesystems without such a problem, the overhead is just a single tree search and no modification to the free space tree, thus the overhead should be minimal. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++ fs/btrfs/free-space-tree.c | 103 +++++++++++++++++++++++++++++++++++++ fs/btrfs/free-space-tree.h | 1 + 3 files changed, 113 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7dea5615bd8f..9bb5d65219a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3013,6 +3013,15 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } + /* + * Before btrfs-progs v6.16.1 mkfs.btrfs can leave free space entries + * for deleted temporary chunks. Delete them if they exist. 
+ */ + ret = btrfs_delete_orphan_free_space_entries(fs_info); + if (ret < 0) { + btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret); + goto out; + } /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a66ce9ef3aff..776b6467dfad 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1710,3 +1710,106 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) else return load_free_space_extents(caching_ctl, path, extent_count); } + +static int delete_orphan_free_space_entries(struct btrfs_root *fst_root, + struct btrfs_path *path, + u64 first_bg_bytenr) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(fst_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + while (true) { + struct btrfs_key key = { 0 }; + int i; + + ret = btrfs_search_slot(trans, fst_root, &key, path, -1, 1); + if (ret < 0) + break; + ASSERT(ret > 0); + ret = 0; + for (i = 0; i < btrfs_header_nritems(path->nodes[0]); i++) { + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid >= first_bg_bytenr) { + /* + * Only break the for() loop and continue to + * delete items. + */ + break; + } + } + /* No items to delete, finished. */ + if (i == 0) + break; + + ret = btrfs_del_items(trans, fst_root, path, 0, i); + if (ret < 0) + break; + btrfs_release_path(path); + } + btrfs_release_path(path); + btrfs_end_transaction(trans); + if (ret == 0) + btrfs_info(fst_root->fs_info, "deleted orphan free space tree entries"); + return ret; +} + +/* Remove any free space entry before the first block group. 
*/ +int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info) +{ + BTRFS_PATH_AUTO_RELEASE(path); + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *root; + struct btrfs_block_group *bg; + u64 first_bg_bytenr; + int ret; + + /* + * Extent tree v2 has multiple global roots based on the block group. + * This means we cannot easily grab the global free space tree and locate + * orphan items. Furthermore this is still experimental, all users + * should use the latest btrfs-progs anyway. + */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + root = btrfs_global_root(fs_info, &key); + if (!root) + return 0; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + bg = btrfs_lookup_first_block_group(fs_info, 0); + if (unlikely(!bg)) { + btrfs_err(fs_info, "no block group found"); + return -EUCLEAN; + } + first_bg_bytenr = bg->start; + btrfs_put_block_group(bg); + + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + return ret; + /* There should not be an all-zero key in fst. */ + ASSERT(ret > 0); + + /* Empty free space tree. 
*/ + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + return 0; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid >= first_bg_bytenr) + return 0; + btrfs_release_path(&path); + return delete_orphan_free_space_entries(root, &path, first_bg_bytenr); +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 3d9a5d4477fc..ca04fc7cf29e 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -35,6 +35,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); +int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * From d6f6109fe4b32878df8a5d4143a055ea680b1f84 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 30 Dec 2025 17:32:45 +0100 Subject: [PATCH 040/137] btrfs: update outdated comment in __add_block_group_free_space() The function add_block_group_free_space() was renamed btrfs_add_block_group_free_space() by commit 6fc5ef782988 ("btrfs: add btrfs prefix to free space tree exported functions"). Update the comment accordingly. Do some reorganization of the next few lines to keep the comment within 80 characters. 
Signed-off-by: Julia Lawall Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 776b6467dfad..ac092898130f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1396,9 +1396,9 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, * can use multiple transactions, every time btrfs_end_transaction() is * called at btrfs_rebuild_free_space_tree() we finish the creation of * new block groups by calling btrfs_create_pending_block_groups(), and - * that in turn calls us, through add_block_group_free_space(), to add - * a free space info item and a free space extent item for the block - * group. + * that in turn calls us, through btrfs_add_block_group_free_space(), + * to add a free space info item and a free space extent item for the + * block group. * * Then later btrfs_rebuild_free_space_tree() may find such new block * groups and processes them with populate_free_space_tree(), which can From 59615e2c1f63dfd65f74c166e803873d3362e51a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 6 Jan 2026 13:20:30 +1030 Subject: [PATCH 041/137] btrfs: reject single block sized compression early Currently for an inode that needs compression, even if there is a delalloc range that is single fs block sized and can not be inlined, we will still go through the compression path. Then inside compress_file_range(), we have one extra check to reject single block sized range, and fall back to regular uncompressed write. This rejection is in fact a little too late, we have already allocated memory to async_chunk, delayed the submission, just to fallback to the same uncompressed write. Change the behavior to reject such cases earlier at inode_need_compress(), so for such single block sized range we won't even bother trying to go through compress path. 
And since the inline small block check is inside inode_need_compress() and compress_file_range() also calls that function, we no longer need a dedicated check inside compress_file_range(). Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247b373bf5cf..be47aa58e944 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -816,6 +816,13 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } + /* + * If the delalloc range is only one fs block and can not be inlined, + * do not even bother try compression, as there will be no space saving + * and will always fallback to regular write later. + */ + if (start != 0 && end + 1 - start <= fs_info->sectorsize) + return 0; /* Defrag ioctl takes precedence over mount options and properties. */ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) return 0; @@ -953,18 +960,7 @@ static void compress_file_range(struct btrfs_work *work) if (actual_end <= start) goto cleanup_and_bail_uncompressed; - total_compressed = actual_end - start; - - /* - * Skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it doesn't save disk space at all. 
- */ - if (total_compressed <= blocksize && - (start > 0 || end + 1 < inode->disk_i_size)) - goto cleanup_and_bail_uncompressed; - - total_compressed = min_t(unsigned long, total_compressed, - BTRFS_MAX_UNCOMPRESSED); + total_compressed = min_t(unsigned long, actual_end - start, BTRFS_MAX_UNCOMPRESSED); total_in = 0; ret = 0; From e582f22030a7a59d4d0bb2881371df259d4a2ecd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Jan 2026 13:30:28 +0100 Subject: [PATCH 042/137] btrfs: split btrfs_fs_closing() and change return type to bool There are two tests in btrfs_fs_closing() but checking the BTRFS_FS_CLOSING_DONE bit is done only in one place load_extent_tree_free(). As this is an inline we can reduce size of the generated code. The types can be also changed to bool as this becomes a simple condition. text data bss dec hex filename 1674006 146704 15560 1836270 1c04ee pre/btrfs.ko 1673772 146704 15560 1836036 1c0404 post/btrfs.ko DELTA: -234 Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/fs.h | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e417aba4c4c7..a1119f06b6d1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -761,7 +761,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) nritems = btrfs_header_nritems(leaf); while (1) { - if (btrfs_fs_closing(fs_info) > 1) { + if (btrfs_fs_closing_done(fs_info)) { last = (u64)-1; break; } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 428b2b239189..e3e5e52e97a2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -1118,15 +1118,17 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) +static inline bool btrfs_fs_closing(const struct btrfs_fs_info *fs_info) { - /* 
Do it this way so we only ever do one test_bit in the normal case. */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; + return unlikely(test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)); +} + +static inline bool btrfs_fs_closing_done(const struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_closing(fs_info) && test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return true; + + return false; +} /* From ae23fee41b36a39f8e163580fe273ca3f88f2413 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 8 Jan 2026 14:31:03 +1030 Subject: [PATCH 043/137] btrfs: remove experimental offload csum mode The offload csum mode was introduced to allow developers to compare the performance of generating checksums for data writes at different timings: - During btrfs_submit_chunk() This is the most common one, if any of the following conditions is met we go this path: * The csum is fast For now it's CRC32C and xxhash. * It's a synchronous write * Zoned - Delay the checksum generation to a workqueue However since commit dd57c78aec39 ("btrfs: introduce btrfs_bio::async_csum") we no longer need to bother any of them. As if it's an experimental build, async checksum generation in the background will be faster anyway. And if not an experimental build, we won't even have the offload csum mode support. Considering the async csum will be the new default, let's remove the offload csum mode code. There will be no impact to end users, and offload csum mode is still under experimental features. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 3 --- fs/btrfs/bio.c | 5 ----- fs/btrfs/sysfs.c | 44 -------------------------------------------- fs/btrfs/volumes.h | 22 ---------------------- 4 files changed, 74 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index d88eb836a193..423122786a93 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -104,9 +104,6 @@ config BTRFS_EXPERIMENTAL - send stream protocol v3 - fs-verity support - - checksum offload mode - sysfs knob to affect when checksums are - calculated (at IO time, or in a thread) - - raid-stripe-tree - additional mapping of extents to devices to support RAID1* profiles on zoned devices, RAID56 not yet supported diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index a12446aa0fbf..d46f39996469 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -665,11 +665,6 @@ static bool should_async_write(struct btrfs_bio *bbio) bool auto_csum_mode = true; #ifdef CONFIG_BTRFS_EXPERIMENTAL - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); - - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) - return true; /* * Write bios will calculate checksum and submit bio at the same time. 
* Unless explicitly required don't offload serial csum calculate and bio diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f0974f4c0ae4..ebd6d1d6778b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1538,47 +1538,6 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_EXPERIMENTAL -static ssize_t btrfs_offload_csum_show(struct kobject *kobj, - struct kobj_attribute *a, char *buf) -{ - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - - switch (READ_ONCE(fs_devices->offload_csum_mode)) { - case BTRFS_OFFLOAD_CSUM_AUTO: - return sysfs_emit(buf, "auto\n"); - case BTRFS_OFFLOAD_CSUM_FORCE_ON: - return sysfs_emit(buf, "1\n"); - case BTRFS_OFFLOAD_CSUM_FORCE_OFF: - return sysfs_emit(buf, "0\n"); - default: - WARN_ON(1); - return -EINVAL; - } -} - -static ssize_t btrfs_offload_csum_store(struct kobject *kobj, - struct kobj_attribute *a, const char *buf, - size_t len) -{ - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret == 0) - WRITE_ONCE(fs_devices->offload_csum_mode, - val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); - else if (ret == -EINVAL && sysfs_streq(buf, "auto")) - WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); - else - return -EINVAL; - - return len; -} -BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); -#endif - /* * Per-filesystem information and stats. 
* @@ -1598,9 +1557,6 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_EXPERIMENTAL - BTRFS_ATTR_PTR(, offload_csum), -#endif NULL, }; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f20abeb16bce..262526657cdf 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -321,25 +321,6 @@ enum btrfs_read_policy { BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_EXPERIMENTAL -/* - * Checksum mode - offload it to workqueues or do it synchronously in - * btrfs_submit_chunk(). - */ -enum btrfs_offload_csum_mode { - /* - * Choose offloading checksum or do it synchronously automatically. - * Do it synchronously if the checksum is fast, or offload to workqueues - * otherwise. - */ - BTRFS_OFFLOAD_CSUM_AUTO, - /* Always offload checksum to workqueues. */ - BTRFS_OFFLOAD_CSUM_FORCE_ON, - /* Never offload checksum to workqueues. */ - BTRFS_OFFLOAD_CSUM_FORCE_OFF, -}; -#endif - struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -466,9 +447,6 @@ struct btrfs_fs_devices { /* Device to be used for reading in case of RAID1. */ u64 read_devid; - - /* Checksum mode - offload it or do it synchronously. */ - enum btrfs_offload_csum_mode offload_csum_mode; #endif }; From 8ecf596ed822d481d1ad0bb589a7d5b9a7e82898 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Jan 2026 12:09:18 +0000 Subject: [PATCH 044/137] btrfs: update comment for delalloc flush and oe wait in btrfs_clone_files() Make the comment more detailed about why we need to flush delalloc and wait for ordered extent completion before attempting to invalidate the page cache. 
Reviewed-by: Qu Wenruo Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ab4ce56d69ee..314cb95ba846 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -754,8 +754,13 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, /* * We may have copied an inline extent into a page of the destination - * range, so wait for writeback to complete before invalidating pages - * from the page cache. This is a rare case. + * range. So flush delalloc and wait for ordered extent completion. + * This is to ensure the invalidation below does not fail, as if for + * example it finds a dirty folio, our folio release callback + * (btrfs_release_folio()) returns false, which makes the invalidation + * return an -EBUSY error. We can't ignore such failures since they + * could come from some range other than the copied inline extent's + * destination range and we have no way to know that. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); if (ret < 0) From 4681dbcfdc33d6627193425222819577a89857cc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 10 Jan 2026 10:08:28 +1030 Subject: [PATCH 045/137] btrfs: shrink the size of btrfs_device There are two main causes of holes inside btrfs_device: - The single byte member of last_flush_error Not only is it a single byte member, but we never really care about the exact error number. - The @devt member Which is placed between two u64 members. Shrink the size of btrfs_device by: - Use a single bit flag for flush error Use BTRFS_DEV_STATE_FLUSH_FAILED so that we no longer need that dedicated member. - Move @devt to the hole after dev_stat_values[] This reduces the size of btrfs_device from 528 to exactly 512 bytes for x86_64. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/volumes.c | 4 ++-- fs/btrfs/volumes.h | 13 +++++++------ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9bb5d65219a7..faa1c2c20ecd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3834,7 +3834,7 @@ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; - device->last_flush_error = BLK_STS_OK; + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); @@ -3859,7 +3859,7 @@ static bool wait_dev_flush(struct btrfs_device *device) wait_for_completion_io(&device->flush_wait); if (bio->bi_status) { - device->last_flush_error = bio->bi_status; + set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); return true; } @@ -3909,7 +3909,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* - * Checks last_flush_error of disks in order to determine the device + * Checks flush failure of disks in order to determine the device * state. */ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL))) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a541cd30c6b8..844657f23e7d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1169,7 +1169,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) * any transaction and set the error state, guaranteeing no commits of * unsafe super blocks. 
*/ - device->last_flush_error = 0; + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); /* Verify the device is back in a pristine state */ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); @@ -7375,7 +7375,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - dev->last_flush_error) + test_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &dev->dev_state)) missing++; else if (failing_dev && failing_dev == dev) missing++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 262526657cdf..59347a4bb185 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -99,6 +99,7 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_REPLACE_TGT (3) #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +#define BTRFS_DEV_STATE_FLUSH_FAILED (6) /* Set when the device item is found in chunk tree, used to catch unexpected registered device. */ #define BTRFS_DEV_STATE_ITEM_FOUND (7) @@ -125,13 +126,7 @@ struct btrfs_device { struct btrfs_zoned_device_info *zone_info; - /* - * Device's major-minor number. Must be set even if the device is not - * opened (bdev == NULL), unless the device is missing. - */ - dev_t devt; unsigned long dev_state; - blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED seqcount_t data_seqcount; @@ -195,6 +190,12 @@ struct btrfs_device { atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. 
+ */ + dev_t devt; + struct extent_io_tree alloc_state; struct completion kobj_unregister; From 23d4f616cb879de3ffea9f686ac60b44740beacb Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 14 Jan 2026 01:18:15 +0000 Subject: [PATCH 046/137] btrfs: use READA_FORWARD_ALWAYS for device extent verification btrfs_verify_dev_extents() iterates through the entire device tree during mount to verify dev extents against chunks. Since this function scans the whole tree, READA_FORWARD_ALWAYS is more appropriate than READA_FORWARD. While the device tree is typically small (a few hundred KB even for multi-TB filesystems), using the correct readahead mode for full-tree iteration is more consistent with the intended usage. Signed-off-by: robbieko Signed-off-by: jinbaohong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 844657f23e7d..c4be17fcb87a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8026,7 +8026,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - path->reada = READA_FORWARD; + path->reada = READA_FORWARD_ALWAYS; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ret; From c7d1d4ff56744074e005771aff193b927392d51f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2026 16:37:26 +0000 Subject: [PATCH 047/137] btrfs: don't BUG() on unexpected delayed ref type in run_one_delayed_ref() There is no need to BUG(), we can just return an error and log an error message. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1dcd69fe97ed..57ffe9b3b954 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1761,32 +1761,36 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; if (TRANS_ABORTED(trans)) { if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes); - free_head_ref_squota_rsv(trans->fs_info, href); + free_head_ref_squota_rsv(fs_info, href); } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) + node->type == BTRFS_SHARED_BLOCK_REF_KEY) { ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) + } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) { ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); - else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + } else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) { ret = 0; - else - BUG(); + } else { + ret = -EUCLEAN; + btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type); + } + if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes); if (ret < 0) - btrfs_err(trans->fs_info, + btrfs_err(fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", node->bytenr, node->num_bytes, node->type, node->action, node->ref_mod, ret); From 271cbe76354e83e56f8d81acad2dba1adb17a896 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2026 16:39:00 
+0000 Subject: [PATCH 048/137] btrfs: remove unnecessary else branch in run_one_delayed_ref() There is no need for an else branch to deal with an unexpected delayed ref type. We can just change the previous branch to deal with this by checking if the ref type is not BTRFS_EXTENT_OWNER_REF_KEY, since that branch is useless as it only sets 'ret' to zero when it's already zero. So merge the two branches. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 57ffe9b3b954..b3a26b61f937 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1780,9 +1780,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->type == BTRFS_SHARED_DATA_REF_KEY) { ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); - } else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) { - ret = 0; - } else { + } else if (unlikely(node->type != BTRFS_EXTENT_OWNER_REF_KEY)) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type); } From b322fa5ff1320430d9a8349cb57770a47399b690 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jan 2026 16:42:57 +0000 Subject: [PATCH 049/137] btrfs: tag as unlikely error handling in run_one_delayed_ref() We don't expect to get errors unless we have a corrupted fs, bad RAM or a bug. So tag the error handling as unlikely. This slightly reduces the module's text size on x86_64 using gcc 14.2.0-19 from Debian. 
Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939458 172512 15592 2127562 2076ca fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939398 172512 15592 2127502 20768e fs/btrfs/btrfs.ko Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b3a26b61f937..5e3877a42ee6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1785,13 +1785,15 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type); } - if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes); - if (ret < 0) + if (unlikely(ret)) { + if (insert_reserved) + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); btrfs_err(fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", node->bytenr, node->num_bytes, node->type, node->action, node->ref_mod, ret); + } + return ret; } From c208aa0ef655bb9a3c379510802cadd0df512f0a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 15 Jan 2026 21:17:35 +0000 Subject: [PATCH 050/137] btrfs: add and use helper to compute the available space for a block group We have currently three places that compute how much available space a block group has. Add a helper function for this and use it in those places. 
Reviewed-by: Boris Burkov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++------- fs/btrfs/block-group.h | 8 ++++++++ fs/btrfs/space-info.c | 3 +-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a1119f06b6d1..9d64cc60a42b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1376,8 +1376,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) goto out; } - num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->zone_unusable - cache->used; + num_bytes = btrfs_block_group_available_space(cache); /* * Data never overcommits, even in mixed mode, so do just the straight @@ -3089,7 +3088,6 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) { struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; BUG_ON(!cache->ro); @@ -3105,10 +3103,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } - num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - - cache->zone_unusable - cache->used; - sinfo->bytes_readonly -= num_bytes; + sinfo->bytes_readonly -= btrfs_block_group_available_space(cache); list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5f933455118c..cd2d53d5b315 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -295,6 +295,14 @@ static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); } +static inline u64 btrfs_block_group_available_space(const struct btrfs_block_group *bg) +{ + 
lockdep_assert_held(&bg->lock); + + return (bg->length - bg->used - bg->pinned - bg->reserved - + bg->bytes_super - bg->zone_unusable); +} + #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 857e4fd2c77e..1d76242f5e0d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -656,8 +656,7 @@ void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, u64 avail; spin_lock(&cache->lock); - avail = cache->length - cache->used - cache->pinned - - cache->reserved - cache->bytes_super - cache->zone_unusable; + avail = btrfs_block_group_available_space(cache); btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", cache->start, cache->length, cache->used, cache->pinned, From ef6a31d035a1000071dc4846aebd02ad081db9e4 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:01 +0000 Subject: [PATCH 051/137] btrfs: add definitions and constants for remap-tree Add an incompat flag for the new remap-tree feature, and the constants and definitions needed to support it. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 4 ++++ fs/btrfs/locking.c | 1 + fs/btrfs/sysfs.c | 3 +++ fs/btrfs/tree-checker.c | 6 ++---- fs/btrfs/tree-checker.h | 5 +++++ fs/btrfs/volumes.c | 1 + include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++ 8 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 78721412951c..09cdd6bfddf5 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -1010,6 +1010,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); +BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64); +BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item, + address, 64); + /* Cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0035851d72b0..e3df5ca0b552 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap") }, { .id = 0, DEFINE_NAME("tree") }, }; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ebd6d1d6778b..8834a1dd499c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -299,6 +299,8 @@ BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. 
*/ BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); +/* Remove once support for remap tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -331,6 +333,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), + BTRFS_FEAT_ATTR_PTR(remap_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c21c21adf61e..aedc208a95b8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) { chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); + type & ~BTRFS_BLOCK_GROUP_VALID); return -EUCLEAN; } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index eb201f4ec3c7..833e2fd989eb 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -57,6 +57,11 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; + +#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \ + BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_BLOCK_GROUP_REMAPPED) + /* * Exported simply for btrfs-progs which wants to have the * btrfs_tree_block_status return codes. 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c4be17fcb87a..d2b7352eb7cb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e8fd92789423..9165154a274d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17) struct btrfs_ioctl_feature_flags { __u64 compat_flags; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index fc29d273845d..f011d34cb699 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -76,6 +76,9 @@ /* Tracks RAID stripes in block groups. */ #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL +/* Holds details of remapped addresses after relocation. */ +#define BTRFS_REMAP_TREE_OBJECTID 13ULL + /* device stats in the device tree */ #define BTRFS_DEV_STATS_OBJECTID 0ULL @@ -282,6 +285,10 @@ #define BTRFS_RAID_STRIPE_KEY 230 +#define BTRFS_IDENTITY_REMAP_KEY 234 +#define BTRFS_REMAP_KEY 235 +#define BTRFS_REMAP_BACKREF_KEY 236 + /* * Records the overall state of the qgroups. 
* There's only one instance of this key present, @@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -1323,4 +1331,13 @@ struct btrfs_verity_descriptor_item { __u8 encryption; } __attribute__ ((__packed__)); +/* + * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives + * the address that the start of the range will get remapped to. This + * structure is also shared by BTRFS_REMAP_BACKREF_KEY. + */ +struct btrfs_remap_item { + __le64 address; +} __attribute__ ((__packed__)); + #endif /* _BTRFS_CTREE_H_ */ From 0b4d29fa98ca1a49c4498353253f857573871ba0 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:02 +0000 Subject: [PATCH 052/137] btrfs: add METADATA_REMAP chunk type Add a new METADATA_REMAP chunk type, which is a metadata chunk that holds the remap tree. This is needed for bootstrapping purposes: the remap tree can't itself be remapped, and must be relocated the existing way, by COWing every leaf. The remap tree can't go in the SYSTEM chunk as space there is limited, because a copy of the chunk item gets placed in the superblock. The changes in fs/btrfs/volumes.h are because we're adding a new block group type bit after the profile bits, and so can no longer rely on the const_ilog2 trick. The sizing to 32MB per chunk, matching the SYSTEM chunk, is an estimate here, we can adjust it later if it proves to be too big or too small. This works out to be ~500,000 remap items, which for a 4KB block size covers ~2GB of remapped data in the worst case and ~500TB in the best case. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 8 ++++++++ fs/btrfs/block-rsv.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/fs.h | 2 ++ fs/btrfs/space-info.c | 13 ++++++++++++- fs/btrfs/sysfs.c | 2 ++ fs/btrfs/tree-checker.c | 13 +++++++++++-- fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 10 +++++++++- include/uapi/linux/btrfs_tree.h | 4 +++- 10 files changed, 52 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 96cf7a162987..e823230c09b7 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_TREE_LOG_OBJECTID: root->block_rsv = &fs_info->treelog_rsv; break; + case BTRFS_REMAP_TREE_OBJECTID: + root->block_rsv = &fs_info->remap_block_rsv; + break; default: root->block_rsv = NULL; break; @@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP); + fs_info->remap_block_rsv.space_info = space_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; @@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->remap_block_rsv.size > 0); + WARN_ON(fs_info->remap_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 79ae9d05cd91..8359fb96bc3c 100644 
--- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -22,6 +22,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_REMAP, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, BTRFS_BLOCK_RSV_TREELOG, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faa1c2c20ecd..922e69038d81 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2751,6 +2751,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP); btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e3e5e52e97a2..195428ecfd75 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -509,6 +509,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* Block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for remap tree. 
*/ + struct btrfs_block_rsv remap_block_rsv; /* Block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 1d76242f5e0d..2c9cf1ab232b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) return SZ_32M; /* Handle BTRFS_BLOCK_GROUP_METADATA */ @@ -348,6 +348,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); @@ -356,7 +358,15 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); + if (ret) + goto out; } + + if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; + ret = create_space_info(fs_info, flags); + } + out: return ret; } @@ -611,6 +621,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, remap_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8834a1dd499c..27bfb7b55ec4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1929,6 +1929,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info) case BTRFS_BLOCK_GROUP_SYSTEM: ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; 
+ case BTRFS_BLOCK_GROUP_METADATA_REMAP: + return "metadata-remap"; default: WARN_ON(1); return "invalid-combination"; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index aedc208a95b8..a6c158cd8fcd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -748,17 +748,26 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP && + !btrfs_fs_incompat(fs_info, REMAP_TREE))) { + block_group_err(leaf, slot, +"invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag", + flags); + return -EUCLEAN; + } + type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && + type != BTRFS_BLOCK_GROUP_METADATA_REMAP && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d2b7352eb7cb..eda6505f3ee5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -231,6 +231,9 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + /* Block groups containing the remap tree. */ + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap"); + /* Block group that has been remapped. 
*/ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59347a4bb185..e4b3cb50f94a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,6 @@ static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ @@ -80,6 +79,15 @@ enum btrfs_raid_types { BTRFS_NR_RAID_TYPES }; +static_assert(BTRFS_RAID_RAID0 == 1); +static_assert(BTRFS_RAID_RAID1 == 2); +static_assert(BTRFS_RAID_DUP == 3); +static_assert(BTRFS_RAID_RAID10 == 4); +static_assert(BTRFS_RAID_RAID5 == 5); +static_assert(BTRFS_RAID_RAID6 == 6); +static_assert(BTRFS_RAID_RAID1C3 == 7); +static_assert(BTRFS_RAID_RAID1C4 == 8); + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index f011d34cb699..76578426671c 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1169,12 +1169,14 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) #define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) +#define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) + BTRFS_BLOCK_GROUP_METADATA | \ + BTRFS_BLOCK_GROUP_METADATA_REMAP) #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ From c3d6dda60c9da79a108592b14560e326384dbf4e Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:03 +0000 Subject: [PATCH 053/137] btrfs: allow remapped chunks to have zero stripes When a chunk has been fully remapped, we are going to set its num_stripes to 0, as it will no longer represent a physical location on disk. Change tree-checker to allow for this, and fix read_one_chunk() to avoid a divide by zero. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 55 ++++++++++++++++++++++++++--------------- fs/btrfs/volumes.c | 7 +++++- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a6c158cd8fcd..ead2e1e2a0bb 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -816,6 +816,32 @@ static void chunk_err(const struct btrfs_fs_info *fs_info, va_end(args); } +static bool valid_stripe_count(u64 profile, u16 num_stripes, u16 sub_stripes) +{ + switch (profile) { + case BTRFS_BLOCK_GROUP_RAID0: + return true; + case BTRFS_BLOCK_GROUP_RAID10: + return sub_stripes == btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes; + case BTRFS_BLOCK_GROUP_RAID1: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1].devs_min; + case BTRFS_BLOCK_GROUP_RAID1C3: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min; + case BTRFS_BLOCK_GROUP_RAID1C4: + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min; + case BTRFS_BLOCK_GROUP_RAID5: + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID5].devs_min; + case BTRFS_BLOCK_GROUP_RAID6: + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID6].devs_min; + case BTRFS_BLOCK_GROUP_DUP: + return num_stripes == btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes; + case 0: /* SINGLE */ + return num_stripes == btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes; + default: + BUG(); + } +} + /* * The common chunk check which could also work on super block sys chunk array. 
* @@ -839,6 +865,7 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, u64 features; u32 chunk_sector_size; bool mixed = false; + bool remapped; int raid_index; int nparity; int ncopies; @@ -861,13 +888,14 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; + remapped = (type & BTRFS_BLOCK_GROUP_REMAPPED); - if (unlikely(!num_stripes)) { + if (unlikely(!remapped && !num_stripes)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } - if (unlikely(num_stripes < ncopies)) { + if (unlikely(num_stripes != 0 && num_stripes < ncopies)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); @@ -965,22 +993,9 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, } } - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && - sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || - (type & BTRFS_BLOCK_GROUP_RAID1 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID1C3 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID1C4 && - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID5 && - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || - (type & BTRFS_BLOCK_GROUP_RAID6 && - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || - (type & BTRFS_BLOCK_GROUP_DUP && - num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { + if (!remapped && + !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK, + num_stripes, sub_stripes)) { chunk_err(fs_info, leaf, chunk, logical, 
"invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -1004,11 +1019,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; - if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { + if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) { chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), - sizeof(struct btrfs_chunk), + offsetof(struct btrfs_chunk, stripe), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eda6505f3ee5..2c9b55f66cc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7047,7 +7047,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - map->stripe_size = btrfs_calc_stripe_length(map); + + if (num_stripes > 0) + map->stripe_size = btrfs_calc_stripe_length(map); + else + map->stripe_size = 0; + for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); From 76377db18a8fb96511b09643e407fe3f9b0a9357 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:04 +0000 Subject: [PATCH 054/137] btrfs: remove remapped block groups from the free-space-tree No new allocations can be done from block groups that have the REMAPPED flag set, so there's no value in their having entries in the free-space tree. Prevent a search through the free-space tree being scheduled for such a block group, and prevent any additions to the in-memory free-space tree. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 19 ++++++++++++++++--- fs/btrfs/free-space-cache.c | 3 +++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9d64cc60a42b..a2f95ac5a8d0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -933,6 +933,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) if (btrfs_is_zoned(fs_info)) return 0; + /* + * No allocations can be done from remapped block groups, so they have + * no entries in the free-space tree. + */ + if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; @@ -1246,10 +1253,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * deletes the block group item from the extent tree, allowing for * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). + * + * If the REMAPPED flag has been set the block group's free space + * has already been removed, so we can skip the call to + * btrfs_remove_block_group_free_space(). 
*/ - ret = btrfs_remove_block_group_free_space(trans, block_group); - if (ret) - goto out; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { + ret = btrfs_remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + } ret = remove_block_group_item(trans, path, block_group); if (ret < 0) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f0f72850fab2..8d4db3d57cf7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2756,6 +2756,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return 0; + if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); From efcab3176eb28427177c6319e128b41efd03ffdd Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:05 +0000 Subject: [PATCH 055/137] btrfs: don't add metadata items for the remap tree to the extent tree There is the following potential problem with the remap tree and delayed refs: * Remapped extent freed in a delayed ref, which removes an entry from the remap tree * Remap tree now small enough to fit in a single leaf * Corruption as we now have a level-0 block with a level-1 metadata item in the extent tree One solution to this would be to rework the remap tree code so that it operates via delayed refs. But as we're hoping to remove cow-only metadata items in the future anyway, change things so that the remap tree doesn't have any entries in the extent tree. This also has the benefit of reducing write amplification. We also make it so that the clear_cache mount option is a no-op, as with the extent tree v2, as the free-space tree can no longer be recreated from the extent tree. Finally disable relocating the remap tree itself, which is added back in a later patch. 
As it is we would get corruption as the traditional relocation method walks the extent tree, and we're removing its metadata items. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 3 +++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 922e69038d81..cd46b9d85880 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2985,6 +2985,8 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) btrfs_warn(fs_info, "'clear_cache' option is ignored with extent tree v2"); + else if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + btrfs_warn(fs_info, "'clear_cache' option is ignored with remap tree"); else rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e3877a42ee6..48a453fa3063 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1553,6 +1553,28 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_RSV_DATA); } +static int drop_remap_tree_ref(struct btrfs_trans_handle *trans, + const struct btrfs_delayed_ref_node *node) +{ + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; + int ret; + + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, const struct btrfs_delayed_ref_node *node, @@ -1747,7 +1769,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, } else if (node->action == 
BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, extent_op); + if (node->ref_root == BTRFS_REMAP_TREE_OBJECTID) + ret = drop_remap_tree_ref(trans, node); + else + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -4890,6 +4915,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + if (unlikely(node->ref_root == BTRFS_REMAP_TREE_OBJECTID)) + goto skip; + extent_key.objectid = node->bytenr; if (skinny_metadata) { /* The owner of a tree block is the level. */ @@ -4942,6 +4970,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_free_path(path); +skip: return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2c9b55f66cc3..6280a1a4c407 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3972,6 +3972,9 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) + return false; + /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { From bf8ff4b9f0aa3f9e49779c8d3edbdc11caa5cd05 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:06 +0000 Subject: [PATCH 056/137] btrfs: rename struct btrfs_block_group field commit_used to last_used Rename the field commit_used in struct btrfs_block_group to last_used, for clarity and consistency with the similar fields we're about to add. It's not obvious that commit_used means "used as of the last commit" rather than "used related to a commit". 
Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 24 ++++++++++++------------ fs/btrfs/block-group.h | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a2f95ac5a8d0..5709acc84297 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2387,7 +2387,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); - cache->commit_used = cache->used; + cache->last_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); @@ -2666,7 +2666,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group_item bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; - u64 old_commit_used; + u64 old_last_used; int ret; spin_lock(&block_group->lock); @@ -2674,8 +2674,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, btrfs_set_stack_block_group_chunk_objectid(&bgi, block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); - old_commit_used = block_group->commit_used; - block_group->commit_used = block_group->used; + old_last_used = block_group->last_used; + block_group->last_used = block_group->used; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; @@ -2684,7 +2684,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); if (ret < 0) { spin_lock(&block_group->lock); - block_group->commit_used = old_commit_used; + block_group->last_used = old_last_used; spin_unlock(&block_group->lock); } @@ -3134,7 +3134,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, 
struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; - u64 old_commit_used; + u64 old_last_used; u64 used; /* @@ -3144,14 +3144,14 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, * may be changed. */ spin_lock(&cache->lock); - old_commit_used = cache->commit_used; + old_last_used = cache->last_used; used = cache->used; /* No change in used bytes, can safely skip it. */ - if (cache->commit_used == used) { + if (cache->last_used == used) { spin_unlock(&cache->lock); return 0; } - cache->commit_used = used; + cache->last_used = used; spin_unlock(&cache->lock); key.objectid = cache->start; @@ -3175,17 +3175,17 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, fail: btrfs_release_path(path); /* - * We didn't update the block group item, need to revert commit_used + * We didn't update the block group item, need to revert last_used * unless the block group item didn't exist yet - this is to prevent a * race with a concurrent insertion of the block group item, with * insert_block_group_item(), that happened just after we attempted to - * update. In that case we would reset commit_used to 0 just after the + * update. In that case we would reset last_used to 0 just after the * insertion set it to a value greater than 0 - if the block group later * becomes with 0 used bytes, we would incorrectly skip its update. 
*/ if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); - cache->commit_used = old_commit_used; + cache->last_used = old_last_used; spin_unlock(&cache->lock); } return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index cd2d53d5b315..b0fb85a36d97 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -132,10 +132,10 @@ struct btrfs_block_group { /* * The last committed used bytes of this block group, if the above @used - * is still the same as @commit_used, we don't need to update block + * is still the same as @last_used, we don't need to update block * group item of this block group. */ - u64 commit_used; + u64 last_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. From 7977011460cffc6f5a0cd830584c832c4aa07076 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:07 +0000 Subject: [PATCH 057/137] btrfs: add extended version of struct block_group_item Add a struct btrfs_block_group_item_v2, which is used in the block group tree if the remap-tree incompat flag is set. This adds two new fields to the block group item: `remap_bytes` and `identity_remap_count`. `remap_bytes` records the amount of data that's physically within this block group, but nominally in another, remapped block group. This is necessary because this data will need to be moved first if this block group is itself relocated. If `remap_bytes` > 0, this is an indicator to the relocation thread that it will need to search the remap-tree for backrefs. A block group must also have `remap_bytes` == 0 before it can be dropped. `identity_remap_count` records how many identity remap items are located in the remap tree for this block group. When relocation is begun for this block group, this is set to the number of holes in the free-space tree for this range. As identity remaps are converted into actual remaps by the relocation process, this number is decreased. 
Once it reaches 0, either because of relocation or because extents have been deleted, the block group has been fully remapped and its chunk's device extents are removed. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 20 +++++++ fs/btrfs/block-group.c | 92 ++++++++++++++++++++++++--------- fs/btrfs/block-group.h | 10 +++- fs/btrfs/discard.c | 2 +- fs/btrfs/tree-checker.c | 10 +++- include/uapi/linux/btrfs_tree.h | 8 +++ 6 files changed, 114 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 09cdd6bfddf5..9797f9e8d4e5 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -240,6 +240,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_block_group_item_v2 */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2, + used, 64); +BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid, + struct btrfs_block_group_item_v2, chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags, + struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes, + struct btrfs_block_group_item_v2, remap_bytes, 64); +BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2, + remap_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count, + struct btrfs_block_group_item_v2, identity_remap_count, 32); +BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2, + identity_remap_count, 32); + 
/* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5709acc84297..a1ab513fa8ea 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2371,7 +2371,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) } static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_block_group_item *bgi, + struct btrfs_block_group_item_v2 *bgi, const struct btrfs_key *key, int need_clear) { @@ -2386,11 +2386,15 @@ static int read_one_block_group(struct btrfs_fs_info *info, return -ENOMEM; cache->length = key->offset; - cache->used = btrfs_stack_block_group_used(bgi); + cache->used = btrfs_stack_block_group_v2_used(bgi); cache->last_used = cache->used; - cache->flags = btrfs_stack_block_group_flags(bgi); - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->flags = btrfs_stack_block_group_v2_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); + cache->last_remap_bytes = cache->remap_bytes; + cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi); + cache->last_identity_remap_count = cache->identity_remap_count; btrfs_set_free_space_tree_thresholds(cache); @@ -2455,7 +2459,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, } else if (cache->length == cache->used) { cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); - } else if (cache->used == 0) { + } else if (cache->used == 0 && cache->remap_bytes == 0) { cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, cache->start + cache->length, NULL); @@ -2475,7 +2479,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, set_avail_alloc_bits(info, 
cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { - if (cache->used == 0) { + if (cache->used == 0 && cache->remap_bytes == 0) { ASSERT(list_empty(&cache->bg_list)); if (btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_discard_queue_work(&info->discard_ctl, cache); @@ -2579,9 +2583,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct extent_buffer *leaf; int slot; + size_t size; ret = find_first_block_group(info, path, &key); if (ret > 0) @@ -2592,8 +2597,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) leaf = path->nodes[0]; slot = path->slots[0]; + if (btrfs_fs_incompat(info, REMAP_TREE)) { + size = sizeof(struct btrfs_block_group_item_v2); + } else { + size = sizeof(struct btrfs_block_group_item); + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0); + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); + size); btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_release_path(path); @@ -2663,25 +2676,34 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; u64 old_last_used; + size_t size; int ret; spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&bgi, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - block_group->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags); + 
btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count); old_last_used = block_group->last_used; block_group->last_used = block_group->used; + block_group->last_remap_bytes = block_group->remap_bytes; + block_group->last_identity_remap_count = block_group->identity_remap_count; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + size = sizeof(struct btrfs_block_group_item_v2); + else + size = sizeof(struct btrfs_block_group_item); + + ret = btrfs_insert_item(trans, root, &key, &bgi, size); if (ret < 0) { spin_lock(&block_group->lock); block_group->last_used = old_last_used; @@ -3132,10 +3154,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_root *root = btrfs_block_group_root(fs_info); unsigned long bi; struct extent_buffer *leaf; - struct btrfs_block_group_item bgi; + struct btrfs_block_group_item_v2 bgi; struct btrfs_key key; - u64 old_last_used; - u64 used; + u64 old_last_used, old_last_remap_bytes; + u32 old_last_identity_remap_count; + u64 used, remap_bytes; + u32 identity_remap_count; /* * Block group items update can be triggered out of commit transaction @@ -3145,13 +3169,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, */ spin_lock(&cache->lock); old_last_used = cache->last_used; + old_last_remap_bytes = cache->last_remap_bytes; + old_last_identity_remap_count = cache->last_identity_remap_count; used = cache->used; - /* No change in used bytes, can safely skip it. */ - if (cache->last_used == used) { + remap_bytes = cache->remap_bytes; + identity_remap_count = cache->identity_remap_count; + /* No change in values, can safely skip it. 
*/ + if (cache->last_used == used && + cache->last_remap_bytes == remap_bytes && + cache->last_identity_remap_count == identity_remap_count) { spin_unlock(&cache->lock); return 0; } cache->last_used = used; + cache->last_remap_bytes = remap_bytes; + cache->last_identity_remap_count = identity_remap_count; spin_unlock(&cache->lock); key.objectid = cache->start; @@ -3167,11 +3199,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, used); - btrfs_set_stack_block_group_chunk_objectid(&bgi, - cache->global_root_id); - btrfs_set_stack_block_group_flags(&bgi, cache->flags); - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); + btrfs_set_stack_block_group_v2_used(&bgi, used); + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id); + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags); + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes); + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, + cache->identity_remap_count); + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item_v2)); + } else { + write_extent_buffer(leaf, &bgi, bi, + sizeof(struct btrfs_block_group_item)); + } + fail: btrfs_release_path(path); /* @@ -3186,6 +3228,8 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->last_used = old_last_used; + cache->last_remap_bytes = old_last_remap_bytes; + cache->last_identity_remap_count = old_last_identity_remap_count; spin_unlock(&cache->lock); } return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index b0fb85a36d97..ecabb1a9fc0e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -129,6 +129,8 @@ struct btrfs_block_group { u64 flags; u64 cache_generation; u64 global_root_id; + u64 
remap_bytes; + u32 identity_remap_count; /* * The last committed used bytes of this block group, if the above @used @@ -136,6 +138,11 @@ struct btrfs_block_group { * group item of this block group. */ u64 last_used; + /* The last committed remap_bytes value of this block group. */ + u64 last_remap_bytes; + /* The last committed identity_remap_count value of this block group. */ + u32 last_identity_remap_count; + /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. @@ -282,7 +289,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) { lockdep_assert_held(&bg->lock); - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || + bg->remap_bytes > 0); } static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 89fe85778115..ee5f5b2788e1 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; - if (block_group->used == 0) + if (block_group->used == 0 && block_group->remap_bytes == 0) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ead2e1e2a0bb..452394b34d01 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf, u64 chunk_objectid; u64 flags; u64 type; + size_t exp_size; /* * Here we don't really care about alignment since extent allocator can @@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (unlikely(item_size != sizeof(bgi))) { + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) + exp_size = 
sizeof(struct btrfs_block_group_item_v2); + else + exp_size = sizeof(struct btrfs_block_group_item); + + if (unlikely(item_size != exp_size)) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", - item_size, sizeof(bgi)); + item_size, exp_size); return -EUCLEAN; } diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 76578426671c..86820a9644e8 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1229,6 +1229,14 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_block_group_item_v2 { + __le64 used; + __le64 chunk_objectid; + __le64 flags; + __le64 remap_bytes; + __le32 identity_remap_count; +} __attribute__ ((__packed__)); + struct btrfs_free_space_info { __le32 extent_count; __le32 flags; From 8620da16fb6be1fd9906374fa1c763a10c6918df Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:08 +0000 Subject: [PATCH 058/137] btrfs: allow mounting filesystems with remap-tree incompat flag If we encounter a filesystem with the remap-tree incompat flag set, validate its compatibility with the other flags, and load the remap tree using the values that have been added to the superblock. The remap-tree feature depends on the free-space-tree, but no-holes and block-group-tree have been made dependencies to reduce the testing matrix. Similarly I'm not aware of any reason why mixed-bg and zoned would be incompatible with remap-tree, but this is blocked for the time being until it can be fully tested. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 2 + fs/btrfs/accessors.h | 6 ++ fs/btrfs/disk-io.c | 105 ++++++++++++++++++++++++++++---- fs/btrfs/extent-tree.c | 2 + fs/btrfs/fs.h | 4 +- fs/btrfs/transaction.c | 7 +++ include/uapi/linux/btrfs_tree.h | 5 +- 7 files changed, 116 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 423122786a93..ede184b6eda1 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -116,4 +116,6 @@ config BTRFS_EXPERIMENTAL - asynchronous checksum generation for data writes + - remap-tree - logical address remapping tree + If unsure, say N. diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 9797f9e8d4e5..8938357fcb40 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -883,6 +883,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block, + remap_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block, + remap_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block, + remap_root_level, 8); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cd46b9d85880..c69734c74c26 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1136,6 +1136,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(btrfs_global_root(fs_info, &key)); case BTRFS_RAID_STRIPE_TREE_OBJECTID: return btrfs_grab_root(fs_info->stripe_root); + case BTRFS_REMAP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->remap_root); default: return NULL; } @@ -1226,6 +1228,7 @@ void 
btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); btrfs_put_root(fs_info->stripe_root); + btrfs_put_root(fs_info->remap_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1778,6 +1781,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); free_root_extent_buffers(info->stripe_root); + free_root_extent_buffers(info->remap_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2191,21 +2195,44 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) goto out; - /* - * This tree can share blocks with some other fs tree during relocation - * and we need a proper setup by btrfs_get_fs_root - */ - root = btrfs_get_fs_root(tree_root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID, true); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - ret = PTR_ERR(root); - goto out; + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* The remap_root has already been loaded in load_important_roots(). */ + root = fs_info->remap_root; + + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + + root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + /* Check that data reloc tree doesn't also exist. 
*/ + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root = btrfs_read_tree_root(fs_info->tree_root, &location); + if (!IS_ERR(root)) { + btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); + btrfs_put_root(root); + return -EIO; + } else if (PTR_ERR(root) != -ENOENT) { + btrfs_warn(fs_info, "error %ld when checking for data reloc tree", + PTR_ERR(root)); } } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; + /* + * This tree can share blocks with some other fs tree during + * relocation and we need a proper setup by btrfs_get_fs_root(). + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + } } location.objectid = BTRFS_QUOTA_TREE_OBJECTID; @@ -2445,6 +2472,35 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + /* + * Reduce test matrix for remap tree by requiring block-group-tree + * and no-holes. Free-space-tree is a hard requirement. 
+ */ + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + btrfs_err(fs_info, +"remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, ZONED)) { + btrfs_err(fs_info, "remap-tree not supported with zoned devices"); + ret = -EINVAL; + } + + if (sectorsize > PAGE_SIZE) { + btrfs_err(fs_info, "remap-tree not supported when block size > page size"); + ret = -EINVAL; + } + } + /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -2603,6 +2659,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + bytenr = btrfs_super_remap_root(sb); + gen = btrfs_super_remap_root_generation(sb); + level = btrfs_super_remap_root_level(sb); + ret = load_super_root(fs_info->remap_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read remap root"); + return ret; + } + } + return 0; } @@ -3231,6 +3299,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; + struct btrfs_root *remap_root; int ret; int level; @@ -3365,6 +3434,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret < 0) goto fail_alloc; + if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { + remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID, + GFP_KERNEL); + fs_info->remap_root = remap_root; + if (!remap_root) { + ret = -ENOMEM; + goto fail_alloc; + } + } + /* * At this point our mount options are validated, 
if we set ->max_inline * to something non-standard make sure we truncate it to sectorsize. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 48a453fa3063..ce4bda1f37ad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2593,6 +2593,8 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; + else if (root == fs_info->remap_root) + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; else flags = BTRFS_BLOCK_GROUP_METADATA; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 195428ecfd75..13b0aa0b9da9 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -315,7 +315,8 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \ + BTRFS_FEATURE_INCOMPAT_REMAP_TREE) #else @@ -475,6 +476,7 @@ struct btrfs_fs_info { struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; struct btrfs_root *stripe_root; + struct btrfs_root *remap_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e2f993b1783f..f4cc9e1a1b93 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1967,6 +1967,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; + + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + root_item = &fs_info->remap_root->root_item; + super->remap_root = root_item->bytenr; + super->remap_root_generation = root_item->generation; + super->remap_root_level = root_item->level; + } } int btrfs_transaction_blocked(struct btrfs_fs_info *info) diff --git a/include/uapi/linux/btrfs_tree.h 
b/include/uapi/linux/btrfs_tree.h index 86820a9644e8..f7843e6bb978 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -721,9 +721,12 @@ struct btrfs_super_block { __u8 metadata_uuid[BTRFS_FSID_SIZE]; __u64 nr_global_roots; + __le64 remap_root; + __le64 remap_root_generation; + __u8 remap_root_level; /* Future expansion */ - __le64 reserved[27]; + __u8 reserved[199]; __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; From 18ba649928711539dd124b4bf7682696b3f2e4a8 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:09 +0000 Subject: [PATCH 059/137] btrfs: redirect I/O for remapped block groups Change btrfs_map_block() so that if the block group has the REMAPPED flag set, we call btrfs_translate_remap() to obtain a new address. btrfs_translate_remap() searches the remap tree for a range corresponding to the logical address passed to btrfs_map_block(). If it is within an identity remap, this part of the block group hasn't yet been relocated, and so we use the existing address. If it is within an actual remap, we subtract the start of the remap range and add the address of its destination, contained in the item's payload. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 50 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/relocation.h | 1 + fs/btrfs/volumes.c | 18 ++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 310b7d817a27..6de508323dbd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3859,6 +3859,56 @@ static const char *stage_to_string(enum reloc_stage stage) return "unknown"; } +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) +{ + int ret; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap; + BTRFS_PATH_AUTO_FREE(path); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = *logical; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + if (path->slots[0] == 0) + return -ENOENT; + + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.type != BTRFS_REMAP_KEY && + found_key.type != BTRFS_IDENTITY_REMAP_KEY) { + return -ENOENT; + } + + if (found_key.objectid > *logical || + found_key.objectid + found_key.offset <= *logical) { + return -ENOENT; + } + + if (*logical + *length > found_key.objectid + found_key.offset) + *length = found_key.objectid + found_key.offset - *logical; + + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) + return 0; + + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + *logical += btrfs_remap_address(leaf, remap) - found_key.objectid; + + return 0; +} + /* * function to relocate all extents in a block group. 
*/ diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 5c36b3f84b57..c0ee26004fc1 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -31,5 +31,6 @@ int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6280a1a4c407..2a4bda452d85 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6586,6 +6586,24 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (IS_ERR(map)) return PTR_ERR(map); + if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) { + u64 new_logical = logical; + + ret = btrfs_translate_remap(fs_info, &new_logical, length); + if (ret) + return ret; + + if (new_logical != logical) { + btrfs_free_chunk_map(map); + + map = btrfs_get_chunk_map(fs_info, new_logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); + + logical = new_logical; + } + } + num_copies = btrfs_chunk_map_num_copies(map); if (io_geom.mirror_num > num_copies) return -EINVAL; From 979e1dc3d69e4c825eec05d05d9567b251f6ec23 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:10 +0000 Subject: [PATCH 060/137] btrfs: handle deletions from remapped block group Handle the case where we free an extent from a block group that has the REMAPPED flag set. Because the remap tree is orthogonal to the extent tree, for data this may be within any number of identity remaps or actual remaps. If we're freeing a metadata node, this will be wholly inside one or the other. 
btrfs_remove_extent_from_remap_tree() searches the remap tree for the remaps that cover the range in question, then calls remove_range_from_remap_tree() for each one, to punch a hole in the remap and adjust the free-space tree. For an identity remap, remove_range_from_remap_tree() will adjust the block group's `identity_remap_count` if this changes. If it reaches zero we mark the block group as fully remapped. Fully remapped block groups have their chunk stripes removed and their device extents freed, which makes the disk space available again to the chunk allocator. This happens asynchronously: in the cleaner thread for sync discard and nodiscard, and (in a later patch) in the discard worker for async discard. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 87 ++++++--- fs/btrfs/block-group.h | 4 + fs/btrfs/disk-io.c | 6 + fs/btrfs/extent-tree.c | 94 ++++++++- fs/btrfs/extent-tree.h | 2 + fs/btrfs/fs.h | 4 +- fs/btrfs/relocation.c | 429 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/relocation.h | 5 + fs/btrfs/volumes.c | 50 ++--- fs/btrfs/volumes.h | 3 + 10 files changed, 629 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a1ab513fa8ea..3b8a750d8519 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1067,6 +1067,24 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans, return btrfs_del_item(trans, root, path); } +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg) +{ + int factor = btrfs_bg_type_to_factor(bg->flags); + + spin_lock(&bg->space_info->lock); + if (btrfs_test_opt(bg->fs_info, ENOSPC_DEBUG)) { + WARN_ON(bg->space_info->total_bytes < bg->length); + WARN_ON(bg->space_info->bytes_readonly < bg->length -
bg->zone_unusable); + WARN_ON(bg->space_info->bytes_zone_unusable < bg->zone_unusable); + WARN_ON(bg->space_info->disk_total < bg->length * factor); + } + bg->space_info->total_bytes -= bg->length; + bg->space_info->bytes_readonly -= (bg->length - bg->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(bg->space_info, -bg->zone_unusable); + bg->space_info->disk_total -= bg->length * factor; + spin_unlock(&bg->space_info->lock); +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { @@ -1078,7 +1096,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct kobject *kobj = NULL; int ret; int index; - int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_map; bool remove_rsv = false; @@ -1087,7 +1104,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!block_group) return -ENOENT; - BUG_ON(!block_group->ro); + BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)); trace_btrfs_remove_block_group(block_group); /* @@ -1099,7 +1116,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->length); index = btrfs_bg_flags_to_raid_index(block_group->flags); - factor = btrfs_bg_type_to_factor(block_group->flags); /* make sure this block group isn't part of an allocation cluster */ cluster = &fs_info->data_alloc_cluster; @@ -1223,26 +1239,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - WARN_ON(block_group->space_info->total_bytes - < block_group->length); - WARN_ON(block_group->space_info->bytes_readonly - < block_group->length - block_group->zone_unusable); - WARN_ON(block_group->space_info->bytes_zone_unusable - < block_group->zone_unusable); - WARN_ON(block_group->space_info->disk_total - < block_group->length * factor); - } - block_group->space_info->total_bytes -= 
block_group->length; - block_group->space_info->bytes_readonly -= - (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, - -block_group->zone_unusable); - block_group->space_info->disk_total -= block_group->length * factor; - spin_unlock(&block_group->space_info->lock); + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) + btrfs_remove_bg_from_sinfo(block_group); + /* * Remove the free space for the block group from the free space tree * and the block group's item from the extent tree before marking the @@ -1575,8 +1576,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (btrfs_is_block_group_used(block_group) || block_group->ro || - list_is_singular(&block_group->list)) { + if (btrfs_is_block_group_used(block_group) || + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) || + list_is_singular(&block_group->list) || + test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &block_group->runtime_flags)) { /* * We want to bail if we made new allocations or have * outstanding allocations in this block group. We do @@ -1617,9 +1620,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. 
*/ used = btrfs_space_info_used(space_info, true); - if ((space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) || - has_unwritten_metadata(block_group)) { + if (((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) && + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1784,6 +1788,9 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) btrfs_get_block_group(bg); trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED && + bg->identity_remap_count == 0) { + /* Leave fully remapped block groups on the fully_remapped_bgs list. */ } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */ trace_btrfs_add_unused_block_group(bg); @@ -4581,6 +4588,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); } + + while (!list_empty(&info->fully_remapped_bgs)) { + block_group = list_first_entry(&info->fully_remapped_bgs, + struct btrfs_block_group, bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } spin_unlock(&info->unused_bgs_lock); spin_lock(&info->zone_active_bgs_lock); @@ -4768,3 +4782,24 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) return false; return true; } + +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, + struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + /* + * The block group might already be on the unused_bgs list, remove it + * if it is. 
It'll get readded after the async discard worker finishes, + * or in btrfs_handle_fully_remapped_bgs() if we're not using async + * discard. + */ + if (!list_empty(&bg->bg_list)) + list_del(&bg->bg_list); + else + btrfs_get_block_group(bg); + + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + spin_unlock(&fs_info->unused_bgs_lock); +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index ecabb1a9fc0e..f5c15c7f6cc7 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -92,6 +92,7 @@ enum btrfs_block_group_flags { * transaction. */ BLOCK_GROUP_FLAG_NEW, + BLOCK_GROUP_FLAG_FULLY_REMAPPED, }; enum btrfs_caching_type { @@ -340,6 +341,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); @@ -411,5 +413,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, + struct btrfs_trans_handle *trans); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c69734c74c26..627282613eee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1473,6 +1473,10 @@ static int cleaner_kthread(void *arg) */ btrfs_run_defrag_inodes(fs_info); + if (btrfs_fs_incompat(fs_info, REMAP_TREE) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_handle_fully_remapped_bgs(fs_info); + /* * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation @@ -2807,6 +2811,7 @@ void 
btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs); INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); @@ -2862,6 +2867,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->remap_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ce4bda1f37ad..5e3e9f18b263 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -41,6 +41,7 @@ #include "tree-checker.h" #include "raid-stripe-tree.h" #include "delayed-inode.h" +#include "relocation.h" #undef SCRAMBLE_DELAYED_REFS @@ -2848,6 +2849,73 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, return 0; } +/* + * Complete the remapping of a block group by removing its chunk stripes and + * device extents, and adding it to the unused list if there's no longer any + * extents nominally within it. + */ +int btrfs_complete_bg_remapping(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_chunk_map *map; + int ret; + + map = btrfs_get_chunk_map(fs_info, bg->start, 1); + if (IS_ERR(map)) + return PTR_ERR(map); + + ret = btrfs_last_identity_remap_gone(map, bg); + if (ret) { + btrfs_free_chunk_map(map); + return ret; + } + + /* + * Set num_stripes to 0, so that btrfs_remove_dev_extents() won't run a + * second time. 
+ */ + map->num_stripes = 0; + + btrfs_free_chunk_map(map); + + if (bg->used == 0) { + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&bg->bg_list)) { + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + } + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_mark_bg_unused(bg); + } + + return 0; +} + +void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *bg; + int ret; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->fully_remapped_bgs)) { + bg = list_first_entry(&fs_info->fully_remapped_bgs, + struct btrfs_block_group, bg_list); + list_del_init(&bg->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + + ret = btrfs_complete_bg_remapping(bg); + if (ret) { + btrfs_put_block_group(bg); + return; + } + + btrfs_put_block_group(bg); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -3000,11 +3068,22 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, struct btrfs_squota_delta *delta) + u64 bytenr, struct btrfs_squota_delta *delta, + struct btrfs_path *path) { int ret; + bool remapped = false; u64 num_bytes = delta->num_bytes; + /* Returns 1 on success and 0 on no-op. 
*/ + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr, num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } else if (ret == 1) { + remapped = true; + } + if (delta->is_data) { struct btrfs_root *csum_root; @@ -3028,10 +3107,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - return ret; + /* If remapped, FST has already been taken care of in remove_range_from_remap_tree(). */ + if (!remapped) { + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } } ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); @@ -3390,7 +3472,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, &delta); + ret = do_free_extent_accounting(trans, bytenr, &delta, path); } btrfs_release_path(path); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 71bb8109c969..d7b6aeb63656 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -163,5 +163,7 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); +void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info); +int btrfs_complete_bg_remapping(struct btrfs_block_group *bg); #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 13b0aa0b9da9..5bbc993a66e1 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -587,6 +587,7 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; + struct mutex remap_mutex; /* * This is taken to make sure 
we don't set block groups ro after the @@ -840,10 +841,11 @@ struct btrfs_fs_info { struct list_head reclaim_bgs; int bg_reclaim_threshold; - /* Protects the lists unused_bgs and reclaim_bgs. */ + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */ spinlock_t unused_bgs_lock; /* Protected by unused_bgs_lock. */ struct list_head unused_bgs; + struct list_head fully_remapped_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ struct mutex reclaim_bgs_lock; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6de508323dbd..e0558b2cd0b4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -37,6 +37,7 @@ #include "super.h" #include "tree-checker.h" #include "raid-stripe-tree.h" +#include "free-space-tree.h" /* * Relocation overview @@ -3859,6 +3860,177 @@ static const char *stage_to_string(enum reloc_stage stage) return "unknown"; } +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg, s64 diff) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + bool bg_already_dirty = true; + bool mark_unused = false; + + spin_lock(&bg->lock); + bg->remap_bytes += diff; + if (bg->used == 0 && bg->remap_bytes == 0) + mark_unused = true; + spin_unlock(&bg->lock); + + if (mark_unused) + btrfs_mark_bg_unused(bg); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
*/ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); +} + +static int remove_chunk_stripes(struct btrfs_trans_handle *trans, + struct btrfs_chunk_map *chunk_map, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + int ret; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_map->start; + + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1); + if (ret) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + btrfs_trans_release_chunk_metadata(trans); + return ret; + } + + leaf = path->nodes[0]; + + chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); + btrfs_set_chunk_num_stripes(leaf, chunk, 0); + btrfs_set_chunk_sub_stripes(leaf, chunk, 0); + + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe), 1); + + btrfs_mark_buffer_dirty(trans, leaf); + + btrfs_release_path(path); + btrfs_trans_release_chunk_metadata(trans); + + return 0; +} + +int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_trans_handle *trans; + int ret; + unsigned int num_items; + BTRFS_PATH_AUTO_FREE(path); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * One item for each entry we're removing in the dev extents tree, and + * another for each device. DUP chunks are all on one device, + * everything else has one device per stripe. 
+ */ + if (bg->flags & BTRFS_BLOCK_GROUP_DUP) + num_items = chunk_map->num_stripes + 1; + else + num_items = 2 * chunk_map->num_stripes; + + trans = btrfs_start_transaction_fallback_global_rsv(fs_info->tree_root, num_items); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_remove_dev_extents(trans, chunk_map); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + mutex_lock(&trans->fs_info->chunk_mutex); + for (unsigned int i = 0; i < chunk_map->num_stripes; i++) { + ret = btrfs_update_device(trans, chunk_map->stripes[i].dev); + if (unlikely(ret)) { + mutex_unlock(&trans->fs_info->chunk_mutex); + btrfs_abort_transaction(trans, ret); + return ret; + } + } + mutex_unlock(&trans->fs_info->chunk_mutex); + + write_lock(&trans->fs_info->mapping_tree_lock); + btrfs_chunk_map_device_clear_bits(chunk_map, CHUNK_ALLOCATED); + write_unlock(&trans->fs_info->mapping_tree_lock); + + btrfs_remove_bg_from_sinfo(bg); + + ret = remove_chunk_stripes(trans, chunk_map, path); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + return 0; +} + +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans, + struct btrfs_block_group *bg, int delta) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + bool bg_already_dirty = true; + bool mark_fully_remapped = false; + + WARN_ON(delta < 0 && -delta > bg->identity_remap_count); + + spin_lock(&bg->lock); + + bg->identity_remap_count += delta; + + if (bg->identity_remap_count == 0 && + !test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags)) { + set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags); + mark_fully_remapped = true; + } + + spin_unlock(&bg->lock); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + 
spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); + + if (mark_fully_remapped) + btrfs_mark_bg_fully_remapped(bg, trans); +} + int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) { int ret; @@ -4463,3 +4635,260 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) logical = fs_info->reloc_ctl->block_group->start; return logical; } + +static int insert_remap_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, + u64 old_addr, u64 length, u64 new_addr) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key; + struct btrfs_remap_item remap = { 0 }; + + if (old_addr == new_addr) { + /* Add new identity remap item. */ + key.objectid = old_addr; + key.type = BTRFS_IDENTITY_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, + &key, 0); + if (ret) + return ret; + } else { + /* Add new remap item. */ + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &key, sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + btrfs_set_stack_remap_address(&remap, new_addr); + + write_extent_buffer(path->nodes[0], &remap, + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + /* Add new backref item. 
*/ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &key, + sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + btrfs_set_stack_remap_address(&remap, old_addr); + + write_extent_buffer(path->nodes[0], &remap, + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), + sizeof(struct btrfs_remap_item)); + } + + btrfs_release_path(path); + + return 0; +} + +/* + * Punch a hole in the remap item or identity remap item pointed to by path, + * for the range [hole_start, hole_start + hole_length). + */ +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *bg, + u64 hole_start, u64 hole_length) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + u64 hole_end, new_addr, remap_start, remap_length, remap_end; + u64 overlap_length; + bool is_identity_remap; + int identity_count_delta = 0; + + hole_end = hole_start + hole_length; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + is_identity_remap = (key.type == BTRFS_IDENTITY_REMAP_KEY); + + remap_start = key.objectid; + remap_length = key.offset; + remap_end = remap_start + remap_length; + + if (is_identity_remap) { + new_addr = remap_start; + } else { + struct btrfs_remap_item *remap_ptr; + + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + new_addr = btrfs_remap_address(leaf, remap_ptr); + } + + /* Delete old item. */ + ret = btrfs_del_item(trans, fs_info->remap_root, path); + btrfs_release_path(path); + if (ret) + return ret; + + if (is_identity_remap) { + identity_count_delta = -1; + } else { + /* Remove backref. 
*/ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = remap_length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + return ret; + } + + ret = btrfs_del_item(trans, fs_info->remap_root, path); + + btrfs_release_path(path); + + if (ret) + return ret; + } + + /* If hole_start > remap_start, re-add the start of the remap item. */ + if (hole_start > remap_start) { + ret = insert_remap_item(trans, path, remap_start, + hole_start - remap_start, new_addr); + if (ret) + return ret; + + if (is_identity_remap) + identity_count_delta++; + } + + /* If hole_end < remap_end, re-add the end of the remap item. */ + if (hole_end < remap_end) { + ret = insert_remap_item(trans, path, hole_end, + remap_end - hole_end, + hole_end - remap_start + new_addr); + if (ret) + return ret; + + if (is_identity_remap) + identity_count_delta++; + } + + if (identity_count_delta != 0) + adjust_identity_remap_count(trans, bg, identity_count_delta); + + overlap_length = min_t(u64, hole_end, remap_end) - + max_t(u64, hole_start, remap_start); + + if (!is_identity_remap) { + struct btrfs_block_group *dest_bg; + + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); + adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length); + btrfs_put_block_group(dest_bg); + ret = btrfs_add_to_free_space_tree(trans, + hole_start - remap_start + new_addr, + overlap_length); + if (ret) + return ret; + } + + ret = overlap_length; + + return ret; +} + +/* + * Return 1 if remove_range_from_remap_tree() has been called successfully, + * 0 if block group wasn't remapped, and a negative number on error. 
+ */ +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_block_group *bg; + int ret, length; + + if (!(btrfs_super_incompat_flags(fs_info->super_copy) & + BTRFS_FEATURE_INCOMPAT_REMAP_TREE)) + return 0; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return 0; + + mutex_lock(&fs_info->remap_mutex); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { + mutex_unlock(&fs_info->remap_mutex); + btrfs_put_block_group(bg); + return 0; + } + + do { + key.objectid = bytenr; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret < 0) + goto end; + + leaf = path->nodes[0]; + if (path->slots[0] == 0) { + ret = -ENOENT; + goto end; + } + + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY && + found_key.type != BTRFS_REMAP_KEY) { + ret = -ENOENT; + goto end; + } + + if (bytenr < found_key.objectid || + bytenr >= found_key.objectid + found_key.offset) { + ret = -ENOENT; + goto end; + } + + length = remove_range_from_remap_tree(trans, path, bg, bytenr, num_bytes); + if (length < 0) { + ret = length; + goto end; + } + + bytenr += length; + num_bytes -= length; + } while (num_bytes > 0); + + ret = 1; + +end: + mutex_unlock(&fs_info->remap_mutex); + + btrfs_put_block_group(bg); + btrfs_release_path(path); + + return ret; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index c0ee26004fc1..3afb6f85b722 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -32,5 +32,10 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info 
*fs_info); int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length); +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes); +int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, + struct btrfs_block_group *bg); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2a4bda452d85..4884c7b62c61 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2925,8 +2925,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; } -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device) +int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; BTRFS_PATH_AUTO_FREE(path); @@ -3224,25 +3223,12 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, return btrfs_free_chunk(trans, chunk_offset); } -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_chunk_map *map; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 dev_extent_len = 0; int i, ret = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(map)) { - /* - * This is a logic error, but we don't want to just rely on the - * user having built with ASSERT enabled, so if ASSERT doesn't - * do anything we still error out. - */ - DEBUG_WARN("errr %ld reading chunk map at offset %llu", - PTR_ERR(map), chunk_offset); - return PTR_ERR(map); - } /* * First delete the device extent items from the devices btree. 
@@ -3263,7 +3249,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (unlikely(ret)) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (device->bytes_used > 0) { @@ -3283,6 +3269,26 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) } mutex_unlock(&fs_devices->device_list_mutex); + return 0; +} + +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_chunk_map *map; + int ret; + + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { + DEBUG_WARN("errr %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); + return PTR_ERR(map); + } + + ret = btrfs_remove_dev_extents(trans, map); + if (ret) + goto out; + /* * We acquire fs_info->chunk_mutex for 2 reasons: * @@ -5419,7 +5425,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int } } -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) { for (int i = 0; i < map->num_stripes; i++) { struct btrfs_io_stripe *stripe = &map->stripes[i]; @@ -5436,7 +5442,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma write_lock(&fs_info->mapping_tree_lock); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); write_unlock(&fs_info->mapping_tree_lock); /* Once for the tree reference. 
*/ @@ -5472,7 +5478,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m return -EEXIST; } chunk_map_device_set_bits(map, CHUNK_ALLOCATED); - chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); return 0; @@ -5828,7 +5834,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) map = rb_entry(node, struct btrfs_chunk_map, rb_node); rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); RB_CLEAR_NODE(&map->rb_node); - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); /* Once for the tree ref. */ btrfs_free_chunk_map(map); cond_resched_rwlock_write(&fs_info->mapping_tree_lock); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e4b3cb50f94a..ed1f8fd95fff 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -776,6 +776,7 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -888,6 +889,8 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, From b56f35560b82e7f8d79aa9ee72720b06639a473c Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 
2026 14:09:11 +0000 Subject: [PATCH 061/137] btrfs: handle setting up relocation of block group with remap-tree Handle the preliminary work for relocating a block group in a filesystem with the remap-tree flag set. If the block group is SYSTEM, btrfs_relocate_block_group() proceeds as it does already, as bootstrapping issues mean that these block groups have to be processed the existing way. Similarly with METADATA_REMAP blocks, which are dealt with in a later patch. Otherwise we walk the free-space tree for the block group in question, recording any holes. These get converted into identity remaps and placed in the remap tree, and the block group's REMAPPED flag is set. From now on no new allocations are possible within this block group, and any I/O to it will be funnelled through btrfs_translate_remap(). We store the number of identity remaps in `identity_remap_count`, so that we know when we've removed the last one and the block group is fully remapped. The change in btrfs_read_roots() is because data relocations no longer rely on the data reloc tree as a hidden subvolume in which to do snapshots. (Thanks to Sun YangKai for his suggestions.) 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 +- fs/btrfs/block-group.h | 2 + fs/btrfs/free-space-tree.c | 4 +- fs/btrfs/free-space-tree.h | 5 +- fs/btrfs/relocation.c | 504 +++++++++++++++++++++++++++++++++---- fs/btrfs/relocation.h | 11 + fs/btrfs/space-info.c | 9 +- fs/btrfs/volumes.c | 84 ++++--- 8 files changed, 532 insertions(+), 93 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3b8a750d8519..dc80f147e98d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2396,6 +2396,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->used = btrfs_stack_block_group_v2_used(bgi); cache->last_used = cache->used; cache->flags = btrfs_stack_block_group_v2_flags(bgi); + cache->last_flags = cache->flags; cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); @@ -2700,6 +2701,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, block_group->last_used = block_group->used; block_group->last_remap_bytes = block_group->remap_bytes; block_group->last_identity_remap_count = block_group->identity_remap_count; + block_group->last_flags = block_group->flags; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; @@ -3184,13 +3186,15 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, /* No change in values, can safely skip it. 
*/ if (cache->last_used == used && cache->last_remap_bytes == remap_bytes && - cache->last_identity_remap_count == identity_remap_count) { + cache->last_identity_remap_count == identity_remap_count && + cache->last_flags == cache->flags) { spin_unlock(&cache->lock); return 0; } cache->last_used = used; cache->last_remap_bytes = remap_bytes; cache->last_identity_remap_count = identity_remap_count; + cache->last_flags = cache->flags; spin_unlock(&cache->lock); key.objectid = cache->start; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index f5c15c7f6cc7..a775c0bc40c3 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -143,6 +143,8 @@ struct btrfs_block_group { u64 last_remap_bytes; /* The last commited identity_remap_count value of this block group. */ u32 last_identity_remap_count; + /* The last committed flags value for this block group. */ + u64 last_flags; /* * If the free space extent count exceeds this number, convert the block diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index ac092898130f..96d52c031977 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); -static struct btrfs_root *btrfs_free_space_root( - struct btrfs_block_group *block_group) +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group) { struct btrfs_key key = { .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, @@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, return 0; } -EXPORT_FOR_TESTS struct btrfs_free_space_info *btrfs_search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index ca04fc7cf29e..709730e36888 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -36,12 +36,13 @@ 
int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info); - -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * btrfs_search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow); +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e0558b2cd0b4..4d3b3854ff7f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3616,7 +3616,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) btrfs_btree_balance_dirty(fs_info); } - if (!err) { + if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { ret = relocate_file_extent_cluster(rc); if (ret < 0) err = ret; @@ -3860,6 +3860,83 @@ static const char *stage_to_string(enum reloc_stage stage) return "unknown"; } +static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path, + struct btrfs_key *entries, unsigned int num_entries) +{ + int ret; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_item_batch batch; + u32 *data_sizes; + u32 max_items; + + max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item); + + data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items), GFP_NOFS); + if (!data_sizes) + return -ENOMEM; + + while (true) { + batch.keys = entries; + batch.data_sizes = data_sizes; + batch.total_data_size = 0; + batch.nr = min_t(u32, num_entries, max_items); + + ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch); + 
btrfs_release_path(path); + + if (num_entries <= max_items) + break; + + num_entries -= max_items; + entries += max_items; + } + + kfree(data_sizes); + + return ret; +} + +struct space_run { + u64 start; + u64 end; +}; + +static void parse_bitmap(u64 block_size, const unsigned long *bitmap, + unsigned long size, u64 address, struct space_run *space_runs, + unsigned int *num_space_runs) +{ + unsigned long pos, end; + u64 run_start, run_length; + + pos = find_first_bit(bitmap, size); + if (pos == size) + return; + + while (true) { + end = find_next_zero_bit(bitmap, size, pos); + + run_start = address + (pos * block_size); + run_length = (end - pos) * block_size; + + if (*num_space_runs != 0 && + space_runs[*num_space_runs - 1].end == run_start) { + space_runs[*num_space_runs - 1].end += run_length; + } else { + space_runs[*num_space_runs].start = run_start; + space_runs[*num_space_runs].end = run_start + run_length; + + (*num_space_runs)++; + } + + if (end == size) + break; + + pos = find_next_bit(bitmap, size, end + 1); + if (pos == size) + break; + } +} + static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg, s64 diff) { @@ -3889,6 +3966,186 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); } +static int create_remap_tree_entries(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_free_space_info *fsi; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct btrfs_root *space_root; + u32 extent_count; + struct space_run *space_runs = NULL; + unsigned int num_space_runs = 0; + struct btrfs_key *entries = NULL; + unsigned int max_entries, num_entries; + int ret; + + mutex_lock(&bg->free_space_lock); + + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) { + mutex_unlock(&bg->free_space_lock); + + 
ret = btrfs_add_block_group_free_space(trans, bg); + if (ret) + return ret; + + mutex_lock(&bg->free_space_lock); + } + + fsi = btrfs_search_free_space_info(trans, bg, path, 0); + if (IS_ERR(fsi)) { + mutex_unlock(&bg->free_space_lock); + return PTR_ERR(fsi); + } + + extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi); + + btrfs_release_path(path); + + space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS); + if (!space_runs) { + mutex_unlock(&bg->free_space_lock); + return -ENOMEM; + } + + key.objectid = bg->start; + key.type = 0; + key.offset = 0; + + space_root = btrfs_free_space_root(bg); + + ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0); + if (ret < 0) { + mutex_unlock(&bg->free_space_lock); + goto out; + } + + ret = 0; + + while (true) { + leaf = path->nodes[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid >= bg->start + bg->length) + break; + + if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + if (num_space_runs != 0 && + space_runs[num_space_runs - 1].end == found_key.objectid) { + space_runs[num_space_runs - 1].end = + found_key.objectid + found_key.offset; + } else { + ASSERT(num_space_runs < extent_count); + + space_runs[num_space_runs].start = found_key.objectid; + space_runs[num_space_runs].end = + found_key.objectid + found_key.offset; + + num_space_runs++; + } + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + void *bitmap; + unsigned long offset; + u32 data_size; + + offset = btrfs_item_ptr_offset(leaf, path->slots[0]); + data_size = btrfs_item_size(leaf, path->slots[0]); + + if (data_size != 0) { + bitmap = kmalloc(data_size, GFP_NOFS); + if (!bitmap) { + mutex_unlock(&bg->free_space_lock); + ret = -ENOMEM; + goto out; + } + + read_extent_buffer(leaf, bitmap, offset, data_size); + + parse_bitmap(fs_info->sectorsize, bitmap, + data_size * BITS_PER_BYTE, + found_key.objectid, space_runs, + &num_space_runs); + + ASSERT(num_space_runs <= extent_count); 
+ + kfree(bitmap); + } + } + + path->slots[0]++; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(space_root, path); + if (ret != 0) { + if (ret == 1) + ret = 0; + break; + } + leaf = path->nodes[0]; + } + } + + btrfs_release_path(path); + + mutex_unlock(&bg->free_space_lock); + + max_entries = extent_count + 2; + entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS); + if (!entries) { + ret = -ENOMEM; + goto out; + } + + num_entries = 0; + + if (num_space_runs == 0) { + entries[num_entries].objectid = bg->start; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = bg->length; + num_entries++; + } else { + if (space_runs[0].start > bg->start) { + entries[num_entries].objectid = bg->start; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = space_runs[0].start - bg->start; + num_entries++; + } + + for (unsigned int i = 1; i < num_space_runs; i++) { + entries[num_entries].objectid = space_runs[i - 1].end; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = + space_runs[i].start - space_runs[i - 1].end; + num_entries++; + } + + if (space_runs[num_space_runs - 1].end < bg->start + bg->length) { + entries[num_entries].objectid = + space_runs[num_space_runs - 1].end; + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY; + entries[num_entries].offset = + bg->start + bg->length - space_runs[num_space_runs - 1].end; + num_entries++; + } + + if (num_entries == 0) + goto out; + } + + bg->identity_remap_count = num_entries; + + ret = add_remap_tree_entries(trans, path, entries, num_entries); + +out: + kfree(entries); + kfree(space_runs); + + return ret; +} + static int remove_chunk_stripes(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *chunk_map, struct btrfs_path *path) @@ -4031,6 +4288,55 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans, btrfs_mark_bg_fully_remapped(bg, trans); } +static int 
mark_chunk_remapped(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 start) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_chunk_map *chunk_map; + struct btrfs_key key; + u64 type; + int ret; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + + read_lock(&fs_info->mapping_tree_lock); + + chunk_map = btrfs_find_chunk_map_nolock(fs_info, start, 1); + if (!chunk_map) { + read_unlock(&fs_info->mapping_tree_lock); + return -ENOENT; + } + + chunk_map->type |= BTRFS_BLOCK_GROUP_REMAPPED; + type = chunk_map->type; + + read_unlock(&fs_info->mapping_tree_lock); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = start; + + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1); + if (ret == 1) { + ret = -ENOENT; + goto end; + } else if (ret < 0) + goto end; + + leaf = path->nodes[0]; + + chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk); + btrfs_set_chunk_type(leaf, chunk, type); + btrfs_mark_buffer_dirty(trans, leaf); + + ret = 0; +end: + btrfs_free_chunk_map(chunk_map); + btrfs_release_path(path); + + return ret; +} + int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) { int ret; @@ -4081,6 +4387,133 @@ int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *leng return 0; } +static int start_block_group_remapping(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + struct btrfs_trans_handle *trans; + bool bg_already_dirty = true; + int ret, ret2; + + ret = btrfs_cache_block_group(bg, true); + if (ret) + return ret; + + trans = btrfs_start_transaction(fs_info->remap_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* We need to run delayed refs, to make sure FST is up to date. 
*/ + ret = btrfs_run_delayed_refs(trans, U64_MAX); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + + mutex_lock(&fs_info->remap_mutex); + + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) { + ret = 0; + goto end; + } + + ret = create_remap_tree_entries(trans, path, bg); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + spin_lock(&bg->lock); + bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED; + spin_unlock(&bg->lock); + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&bg->dirty_list)) { + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(bg); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); + + ret = mark_chunk_remapped(trans, path, bg->start); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + ret = btrfs_remove_block_group_free_space(trans, bg); + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); + goto end; + } + + btrfs_remove_free_space_cache(bg); + +end: + mutex_unlock(&fs_info->remap_mutex); + + ret2 = btrfs_end_transaction(trans); + if (!ret) + ret = ret2; + + return ret; +} + +static int do_nonremap_reloc(struct btrfs_fs_info *fs_info, bool verbose, + struct reloc_control *rc) +{ + int ret; + + while (1) { + enum reloc_stage finishes_stage; + + mutex_lock(&fs_info->cleaner_mutex); + ret = relocate_block_group(rc); + mutex_unlock(&fs_info->cleaner_mutex); + + finishes_stage = rc->stage; + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. 
Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + int wb_ret; + + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), + 0, (u64)-1); + if (wb_ret && ret == 0) + ret = wb_ret; + invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } + + if (ret < 0) + return ret; + + if (rc->extents_found == 0) + break; + + if (verbose) + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); + } + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(rc->block_group->used > 0); + + return 0; +} + /* * function to relocate all extents in a block group. */ @@ -4091,7 +4524,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); struct reloc_control *rc; struct inode *inode; - struct btrfs_path *path; + struct btrfs_path *path = NULL; int ret; bool bg_is_ro = false; @@ -4153,7 +4586,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, } inode = lookup_free_space_inode(rc->block_group, path); - btrfs_free_path(path); + btrfs_release_path(path); if (!IS_ERR(inode)) ret = delete_block_group_cache(rc->block_group, inode, 0); @@ -4163,11 +4596,13 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, if (ret && ret != -ENOENT) goto out; - rc->data_inode = create_reloc_inode(rc->block_group); - if (IS_ERR(rc->data_inode)) { - ret = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - goto out; + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) { + rc->data_inode = create_reloc_inode(rc->block_group); + if (IS_ERR(rc->data_inode)) { + ret = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } } if (verbose) @@ -4180,54 +4615,17 @@ 
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); - while (1) { - enum reloc_stage finishes_stage; + if (should_relocate_using_remap_tree(bg)) + ret = start_block_group_remapping(fs_info, path, bg); + else + ret = do_nonremap_reloc(fs_info, verbose, rc); - mutex_lock(&fs_info->cleaner_mutex); - ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); - - finishes_stage = rc->stage; - /* - * We may have gotten ENOSPC after we already dirtied some - * extents. If writeout happens while we're relocating a - * different block group we could end up hitting the - * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in - * btrfs_reloc_cow_block. Make sure we write everything out - * properly so we don't trip over this problem, and then break - * out of the loop if we hit an error. - */ - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - int wb_ret; - - wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, - (u64)-1); - if (wb_ret && ret == 0) - ret = wb_ret; - invalidate_mapping_pages(rc->data_inode->i_mapping, - 0, -1); - rc->stage = UPDATE_DATA_PTRS; - } - - if (ret < 0) - goto out; - - if (rc->extents_found == 0) - break; - - if (verbose) - btrfs_info(fs_info, "found %llu extents, stage: %s", - rc->extents_found, - stage_to_string(finishes_stage)); - } - - WARN_ON(rc->block_group->pinned > 0); - WARN_ON(rc->block_group->reserved > 0); - WARN_ON(rc->block_group->used > 0); out: if (ret && bg_is_ro) btrfs_dec_block_group_ro(rc->block_group); - iput(rc->data_inode); + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) + iput(rc->data_inode); + btrfs_free_path(path); reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); @@ -4421,7 +4819,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) btrfs_free_path(path); - if (ret == 0) { + if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { /* cleanup orphan inode in data 
relocation tree */ fs_root = btrfs_grab_root(fs_info->data_reloc_root); ASSERT(fs_root); diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 3afb6f85b722..d647823b5d13 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -12,6 +12,17 @@ struct btrfs_trans_handle; struct btrfs_ordered_extent; struct btrfs_pending_snapshot; +static inline bool should_relocate_using_remap_tree(const struct btrfs_block_group *bg) +{ + if (!btrfs_fs_incompat(bg->fs_info, REMAP_TREE)) + return false; + + if (bg->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) + return false; + + return true; +} + int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, bool verbose); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2c9cf1ab232b..ebe97d6d67d3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -380,8 +380,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, factor = btrfs_bg_type_to_factor(block_group->flags); spin_lock(&space_info->lock); - space_info->total_bytes += block_group->length; - space_info->disk_total += block_group->length * factor; + + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) || + block_group->identity_remap_count != 0) { + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + } + space_info->bytes_used += block_group->used; space_info->disk_used += block_group->used * factor; space_info->bytes_readonly += block_group->bytes_super; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4884c7b62c61..e85ffeda006d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3400,15 +3400,50 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return ret; } -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, - bool verbose) +static int 
btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *bg) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; - struct btrfs_block_group *block_group; u64 length; int ret; + btrfs_discard_cancel_work(&fs_info->discard_ctl, bg); + length = bg->length; + btrfs_put_block_group(bg); + + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, bg->start, length, NULL); + if (ret) + btrfs_info(fs_info, "failed to reset zone %llu after relocation", + bg->start); + } + + trans = btrfs_start_trans_remove_block_group(root->fs_info, bg->start); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, NULL); + return ret; + } + + /* Step two, delete the device extents and the chunk tree entries. */ + ret = btrfs_remove_chunk(trans, bg->start); + btrfs_end_transaction(trans); + + return ret; +} + +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool verbose) +{ + struct btrfs_block_group *block_group; + int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "relocate: not supported on extent tree v2 yet"); @@ -3446,38 +3481,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, block_group = btrfs_lookup_block_group(fs_info, chunk_offset); if (!block_group) return -ENOENT; - btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - length = block_group->length; - btrfs_put_block_group(block_group); - /* - * On a zoned file system, discard the whole block group, this will - * trigger a REQ_OP_ZONE_RESET operation on the device zone. If - * resetting the zone fails, don't treat it as a fatal problem from the - * filesystem's point of view. 
- */ - if (btrfs_is_zoned(fs_info)) { - ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); - if (ret) - btrfs_info(fs_info, - "failed to reset zone %llu after relocation", - chunk_offset); + if (should_relocate_using_remap_tree(block_group)) { + /* If we're relocating using the remap tree we're now done. */ + btrfs_put_block_group(block_group); + ret = 0; + } else { + ret = btrfs_relocate_chunk_finish(fs_info, block_group); } - trans = btrfs_start_trans_remove_block_group(root->fs_info, - chunk_offset); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_handle_fs_error(root->fs_info, ret, NULL); - return ret; - } - - /* - * step two, delete the device extents and the - * chunk tree entries - */ - ret = btrfs_remove_chunk(trans, chunk_offset); - btrfs_end_transaction(trans); return ret; } @@ -4150,6 +4162,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); chunk_type = btrfs_chunk_type(leaf, chunk); + /* Check if chunk has already been fully relocated. */ + if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED && + btrfs_chunk_num_stripes(leaf, chunk) == 0) { + btrfs_release_path(path); + mutex_unlock(&fs_info->reclaim_bgs_lock); + goto loop; + } + if (!counting) { spin_lock(&fs_info->balance_lock); bctl->stat.considered++; From bbea42dfb91f6901243958c83f26bbbd3a4a85fa Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:12 +0000 Subject: [PATCH 062/137] btrfs: move existing remaps before relocating block group If when relocating a block group we find that `remap_bytes` > 0 in its block group item, that means that it has been the destination block group for another that has been remapped. We need to search the remap tree for any remap backrefs within this range, and move the data to a third block group. This is because otherwise btrfs_translate_remap() could end up following an unbounded chain of remaps, which would only get worse over time. 
We only relocate one block group at a time, so `remap_bytes` will only ever go down while we are doing this. Once we're finished we set the REMAPPED flag on the block group, which will permanently prevent any other data from being moved to within it. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/bio.h | 3 + fs/btrfs/extent-tree.c | 6 +- fs/btrfs/relocation.c | 461 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 467 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d46f39996469..d3475d179362 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -821,7 +821,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) */ if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { + !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) { if (should_async_write(bbio) && btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) goto done; diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 157cdfa2f78a..303ed6c7103d 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -90,6 +90,9 @@ struct btrfs_bio { */ bool is_scrub:1; + /* Whether the bio is coming from copy_remapped_data_io(). */ + bool is_remap:1; + /* Whether the csum generation for data write is async. 
*/ bool async_csum:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e3e9f18b263..ebff087b4e89 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4557,7 +4557,8 @@ static noinline int find_free_extent(struct btrfs_root *root, block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || - block_group->ro) { + block_group->ro || + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { /* * someone is removing this block group, * we can't jump into the have_block_group @@ -4591,7 +4592,8 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) { + if (unlikely(block_group->ro || + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); if (ffe_ctl->for_data_reloc) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4d3b3854ff7f..a1558ee92d29 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3966,6 +3966,457 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); } +/* Private structure for I/O from copy_remapped_data(). 
*/ +struct reloc_io_private { + struct completion done; + refcount_t pending_refs; + blk_status_t status; +}; + +static void reloc_endio(struct btrfs_bio *bbio) +{ + struct reloc_io_private *priv = bbio->private; + + if (bbio->bio.bi_status) + WRITE_ONCE(priv->status, bbio->bio.bi_status); + + if (refcount_dec_and_test(&priv->pending_refs)) + complete(&priv->done); + + bio_put(&bbio->bio); +} + +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info, + struct reloc_io_private *priv, + struct page **pages, u64 addr, u64 length, + blk_opf_t op) +{ + struct btrfs_bio *bbio; + int i; + + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode), + addr, reloc_endio, priv); + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); + bbio->is_remap = true; + + i = 0; + do { + size_t bytes = min_t(u64, length, PAGE_SIZE); + + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, + BTRFS_I(fs_info->btree_inode), + addr, reloc_endio, priv); + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); + bbio->is_remap = true; + continue; + } + + i++; + addr += bytes; + length -= bytes; + } while (length); + + refcount_inc(&priv->pending_refs); + btrfs_submit_bbio(bbio, 0); + + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); + + return blk_status_to_errno(READ_ONCE(priv->status)); +} + +static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr, + u64 new_addr, u64 length) +{ + int ret; + u64 copy_len = min_t(u64, length, SZ_1M); + struct page **pages; + struct reloc_io_private priv; + unsigned int nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) 
{ + ret = -ENOMEM; + goto end; + } + + /* Copy 1MB at a time, to avoid using too much memory. */ + do { + u64 to_copy = min_t(u64, length, copy_len); + + /* Limit to one bio. */ + to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT); + + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, + to_copy, REQ_OP_READ); + if (ret) + goto end; + + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, + to_copy, REQ_OP_WRITE); + if (ret) + goto end; + + if (to_copy == length) + break; + + old_addr += to_copy; + new_addr += to_copy; + length -= to_copy; + } while (true); + + ret = 0; +end: + for (int i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + + return ret; +} + +static int add_remap_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 new_addr, u64 length, + u64 old_addr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, + &key, sizeof(struct btrfs_remap_item)); + if (ret) + return ret; + + leaf = path->nodes[0]; + btrfs_set_stack_remap_address(&remap, new_addr); + write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + return 0; +} + +static int add_remap_backref_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 new_addr, + u64 length, u64 old_addr) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, &key, + sizeof(struct btrfs_remap_item)); + if (ret) + return 
ret; + + leaf = path->nodes[0]; + btrfs_set_stack_remap_address(&remap, old_addr); + write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + + btrfs_release_path(path); + + return 0; +} + +static int move_existing_remap(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_block_group *bg, u64 new_addr, + u64 length, u64 old_addr) +{ + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap_ptr; + struct btrfs_remap_item remap = { 0 }; + struct btrfs_key key, ins; + u64 dest_addr, dest_length, min_size; + struct btrfs_block_group *dest_bg; + int ret; + const bool is_data = (bg->flags & BTRFS_BLOCK_GROUP_DATA); + struct btrfs_space_info *sinfo = bg->space_info; + bool mutex_taken = false; + bool bg_needs_free_space; + + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, length); + spin_unlock(&sinfo->lock); + + if (is_data) + min_size = fs_info->sectorsize; + else + min_size = fs_info->nodesize; + + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size, + 0, 0, &ins, is_data, false); + if (unlikely(ret)) { + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, -length); + spin_unlock(&sinfo->lock); + return ret; + } + + dest_addr = ins.objectid; + dest_length = ins.offset; + + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) { + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize); + + btrfs_free_reserved_extent(fs_info, dest_addr + new_length, + dest_length - new_length, 0); + + dest_length = new_length; + } + + trans = btrfs_join_transaction(fs_info->remap_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto end; + } + + mutex_lock(&fs_info->remap_mutex); + mutex_taken = true; + + /* Find old remap entry. 
*/ + key.objectid = old_addr; + key.type = BTRFS_REMAP_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret == 1) { + /* + * Not a problem if the remap entry wasn't found: that means + * that another transaction has deallocated the data. + * move_existing_remaps() loops until the BG contains no + * remaps, so we can just return 0 in this case. + */ + btrfs_release_path(path); + ret = 0; + goto end; + } else if (unlikely(ret)) { + goto end; + } + + ret = copy_remapped_data(fs_info, new_addr, dest_addr, dest_length); + if (unlikely(ret)) + goto end; + + /* Change data of old remap entry. */ + leaf = path->nodes[0]; + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + btrfs_set_remap_address(leaf, remap_ptr, dest_addr); + btrfs_mark_buffer_dirty(trans, leaf); + + if (dest_length != length) { + key.offset = dest_length; + btrfs_set_item_key_safe(trans, path, &key); + } + + btrfs_release_path(path); + + if (dest_length != length) { + /* Add remap item for remainder. */ + ret = add_remap_item(trans, path, new_addr + dest_length, + length - dest_length, old_addr + dest_length); + if (unlikely(ret)) + goto end; + } + + /* Change or remove old backref. 
*/ + key.objectid = new_addr; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (unlikely(ret)) { + if (ret == 1) { + btrfs_release_path(path); + ret = -ENOENT; + } + goto end; + } + + leaf = path->nodes[0]; + + if (dest_length == length) { + ret = btrfs_del_item(trans, fs_info->remap_root, path); + if (unlikely(ret)) { + btrfs_release_path(path); + goto end; + } + } else { + key.objectid += dest_length; + key.offset -= dest_length; + btrfs_set_item_key_safe(trans, path, &key); + btrfs_set_stack_remap_address(&remap, old_addr + dest_length); + + write_extent_buffer(leaf, &remap, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_remap_item)); + } + + btrfs_release_path(path); + + /* Add new backref. */ + ret = add_remap_backref_item(trans, path, dest_addr, dest_length, old_addr); + if (unlikely(ret)) + goto end; + + adjust_block_group_remap_bytes(trans, bg, -dest_length); + + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length); + if (unlikely(ret)) + goto end; + + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr); + + adjust_block_group_remap_bytes(trans, dest_bg, dest_length); + + mutex_lock(&dest_bg->free_space_lock); + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &dest_bg->runtime_flags); + mutex_unlock(&dest_bg->free_space_lock); + btrfs_put_block_group(dest_bg); + + if (bg_needs_free_space) { + ret = btrfs_add_block_group_free_space(trans, dest_bg); + if (unlikely(ret)) + goto end; + } + + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length); + if (unlikely(ret)) { + btrfs_remove_from_free_space_tree(trans, new_addr, dest_length); + goto end; + } + + ret = 0; + +end: + if (mutex_taken) + mutex_unlock(&fs_info->remap_mutex); + + btrfs_dec_block_group_reservations(fs_info, dest_addr); + + if (unlikely(ret)) { + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0); + + if (trans) { + 
btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + } else { + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr); + btrfs_free_reserved_bytes(dest_bg, dest_length, 0); + btrfs_put_block_group(dest_bg); + + ret = btrfs_commit_transaction(trans); + } + + return ret; +} + +static int move_existing_remaps(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *bg, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_remap_item *remap; + u64 old_addr; + + /* Look for backrefs in remap tree. */ + while (bg->remap_bytes > 0) { + key.objectid = bg->start; + key.type = BTRFS_REMAP_BACKREF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + if (ret) { + btrfs_release_path(path); + break; + } + + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type != BTRFS_REMAP_BACKREF_KEY) { + path->slots[0]++; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + + if (ret) { + btrfs_release_path(path); + break; + } + + leaf = path->nodes[0]; + } + } + + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); + old_addr = btrfs_remap_address(leaf, remap); + + btrfs_release_path(path); + + ret = move_existing_remap(fs_info, path, bg, key.objectid, + key.offset, old_addr); + if (ret) + return ret; + } + + ASSERT(bg->remap_bytes == 0); + + return 0; +} + static int create_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_block_group *bg) @@ -4615,10 +5066,16 @@ int btrfs_relocate_block_group(struct btrfs_fs_info 
*fs_info, u64 group_start, ret = btrfs_zone_finish(rc->block_group); WARN_ON(ret && ret != -EAGAIN); - if (should_relocate_using_remap_tree(bg)) + if (should_relocate_using_remap_tree(bg)) { + if (bg->remap_bytes != 0) { + ret = move_existing_remaps(fs_info, bg, path); + if (ret) + goto out; + } ret = start_block_group_remapping(fs_info, path, bg); - else + } else { ret = do_nonremap_reloc(fs_info, verbose, rc); + } out: if (ret && bg_is_ro) From fd6594b1446cc753450bad8d0da6288da1ad7b96 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:13 +0000 Subject: [PATCH 063/137] btrfs: replace identity remaps with actual remaps when doing relocations Add a function do_remap_tree_reloc(), which does the actual work of doing a relocation using the remap tree. In a loop we call do_remap_reloc_trans(), which searches for the first identity remap for the block group. We call btrfs_reserve_extent() to find space elsewhere for it, and read the data into memory and write it to the new location. We then carve out the identity remap and replace it with an actual remap, which points to the new location in which to look. Once the last identity remap has been removed we call last_identity_remap_gone(), which, as with deletions, removes the chunk's stripes and device extents. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 326 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a1558ee92d29..3f8017ad0033 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4597,6 +4597,60 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans, return ret; } +static int find_next_identity_remap(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 bg_end, + u64 last_start, u64 *start, u64 *length) +{ + int ret; + struct btrfs_key key, found_key; + struct btrfs_root *remap_root = trans->fs_info->remap_root; + struct extent_buffer *leaf; + + key.objectid = last_start; + key.type = BTRFS_IDENTITY_REMAP_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + while (true) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(remap_root, path); + + if (ret != 0) { + if (ret == 1) + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid >= bg_end) { + ret = -ENOENT; + goto out; + } + + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) { + *start = found_key.objectid; + *length = found_key.offset; + ret = 0; + goto out; + } + + path->slots[0]++; + } + +out: + btrfs_release_path(path); + + return ret; +} + static int remove_chunk_stripes(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *chunk_map, struct btrfs_path *path) @@ -4739,6 +4793,92 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans, btrfs_mark_bg_fully_remapped(bg, trans); } +static int add_remap_entry(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *src_bg, u64 old_addr, + u64 new_addr, u64 length) +{ + struct btrfs_fs_info *fs_info 
= trans->fs_info; + struct btrfs_key key, new_key; + int ret; + int identity_count_delta = 0; + + key.objectid = old_addr; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); + if (ret < 0) + goto end; + + if (path->slots[0] == 0) { + ret = -ENOENT; + goto end; + } + + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_IDENTITY_REMAP_KEY || + key.objectid > old_addr || + key.objectid + key.offset <= old_addr) { + ret = -ENOENT; + goto end; + } + + /* Shorten or delete identity mapping entry. */ + if (key.objectid == old_addr) { + ret = btrfs_del_item(trans, fs_info->remap_root, path); + if (ret) + goto end; + + identity_count_delta--; + } else { + new_key.objectid = key.objectid; + new_key.type = BTRFS_IDENTITY_REMAP_KEY; + new_key.offset = old_addr - key.objectid; + + btrfs_set_item_key_safe(trans, path, &new_key); + } + + btrfs_release_path(path); + + /* Create new remap entry. */ + ret = add_remap_item(trans, path, new_addr, length, old_addr); + if (ret) + goto end; + + /* Add entry for remainder of identity mapping, if necessary. */ + if (key.objectid + key.offset != old_addr + length) { + new_key.objectid = old_addr + length; + new_key.type = BTRFS_IDENTITY_REMAP_KEY; + new_key.offset = key.objectid + key.offset - old_addr - length; + + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, + path, &new_key, 0); + if (ret) + goto end; + + btrfs_release_path(path); + + identity_count_delta++; + } + + /* Add backref. 
*/ + ret = add_remap_backref_item(trans, path, new_addr, length, old_addr); + if (ret) + goto end; + + if (identity_count_delta != 0) + adjust_identity_remap_count(trans, src_bg, identity_count_delta); + +end: + btrfs_release_path(path); + + return ret; +} + static int mark_chunk_remapped(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 start) { @@ -4788,6 +4928,184 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans, return ret; } +static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info, + struct btrfs_block_group *src_bg, + struct btrfs_path *path, u64 *last_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *extent_root; + struct btrfs_key ins; + struct btrfs_block_group *dest_bg = NULL; + u64 start = 0, remap_length = 0; + u64 length, new_addr, min_size; + int ret; + const bool is_data = (src_bg->flags & BTRFS_BLOCK_GROUP_DATA); + bool no_more = false; + bool made_reservation = false, bg_needs_free_space; + struct btrfs_space_info *sinfo = src_bg->space_info; + + extent_root = btrfs_extent_root(fs_info, src_bg->start); + + trans = btrfs_start_transaction(extent_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + mutex_lock(&fs_info->remap_mutex); + + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length, + *last_start, &start, &remap_length); + if (ret == -ENOENT) { + no_more = true; + goto next; + } else if (ret) { + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + return ret; + } + + /* Try to reserve enough space for block. */ + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, remap_length); + spin_unlock(&sinfo->lock); + + if (is_data) + min_size = fs_info->sectorsize; + else + min_size = fs_info->nodesize; + + /* + * We're using btrfs_reserve_extent() to allocate a contiguous + * logical address range, but this will become a remap item rather than + * an extent in the extent tree. 
+ * + * Short allocations are fine: it means that we chop off the beginning + * of the identity remap that we're processing, and will tackle the + * rest of it the next time round. + */ + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length, remap_length, + min_size, 0, 0, &ins, is_data, false); + if (ret) { + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length); + spin_unlock(&sinfo->lock); + + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + return ret; + } + + made_reservation = true; + + new_addr = ins.objectid; + length = ins.offset; + + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) { + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize); + + btrfs_free_reserved_extent(fs_info, new_addr + new_length, + length - new_length, 0); + + length = new_length; + } + + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); + + mutex_lock(&dest_bg->free_space_lock); + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &dest_bg->runtime_flags); + mutex_unlock(&dest_bg->free_space_lock); + + if (bg_needs_free_space) { + ret = btrfs_add_block_group_free_space(trans, dest_bg); + if (ret) + goto fail; + } + + ret = copy_remapped_data(fs_info, start, new_addr, length); + if (ret) + goto fail; + + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length); + if (ret) + goto fail; + + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length); + if (ret) { + btrfs_add_to_free_space_tree(trans, new_addr, length); + goto fail; + } + + adjust_block_group_remap_bytes(trans, dest_bg, length); + btrfs_free_reserved_bytes(dest_bg, length, 0); + + spin_lock(&sinfo->lock); + sinfo->bytes_readonly += length; + spin_unlock(&sinfo->lock); + +next: + if (dest_bg) + btrfs_put_block_group(dest_bg); + + if (made_reservation) + btrfs_dec_block_group_reservations(fs_info, new_addr); + + mutex_unlock(&fs_info->remap_mutex); + + if (src_bg->identity_remap_count == 0) { + bool 
mark_fully_remapped = false; + + spin_lock(&src_bg->lock); + if (!test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags)) { + mark_fully_remapped = true; + set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags); + } + spin_unlock(&src_bg->lock); + + if (mark_fully_remapped) + btrfs_mark_bg_fully_remapped(src_bg, trans); + } + + ret = btrfs_end_transaction(trans); + if (ret) + return ret; + + if (no_more) + return 1; + + *last_start = start; + + return 0; + +fail: + if (dest_bg) + btrfs_put_block_group(dest_bg); + + btrfs_free_reserved_extent(fs_info, new_addr, length, 0); + + mutex_unlock(&fs_info->remap_mutex); + btrfs_end_transaction(trans); + + return ret; +} + +static int do_remap_reloc(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_block_group *bg) +{ + u64 last_start = bg->start; + int ret; + + while (true) { + ret = do_remap_reloc_trans(fs_info, bg, path, &last_start); + if (ret) { + if (ret == 1) + ret = 0; + break; + } + } + + return ret; +} + int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) { int ret; @@ -5073,6 +5391,14 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, goto out; } ret = start_block_group_remapping(fs_info, path, bg); + if (ret) + goto out; + + ret = do_remap_reloc(fs_info, path, rc->block_group); + if (ret) + goto out; + + btrfs_delete_unused_bgs(fs_info); } else { ret = do_nonremap_reloc(fs_info, verbose, rc); } From a645372e7e40be088828ad99aa9a6c68f83ef00d Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:14 +0000 Subject: [PATCH 064/137] btrfs: add do_remap parameter to btrfs_discard_extent() btrfs_discard_extent() can be called either when an extent is removed or from walking the free-space tree. With a remapped block group these two things are no longer equivalent: the extent's addresses are remapped, while the free-space tree exclusively uses underlying addresses. 
Add a do_remap parameter to btrfs_discard_extent() and btrfs_map_discard(), saying whether or not the address needs to be run through the remap tree first. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 +++++---- fs/btrfs/extent-tree.h | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/volumes.c | 22 ++++++++++++++++++++-- fs/btrfs/volumes.h | 2 +- 6 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ebff087b4e89..c063c5b6c433 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1381,7 +1381,7 @@ static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) } int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) + u64 num_bytes, u64 *actual_bytes, bool do_remap) { int ret = 0; u64 discarded_bytes = 0; @@ -1399,7 +1399,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, int i; num_bytes = end - cur; - stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes, + do_remap); if (IS_ERR(stripes)) { ret = PTR_ERR(stripes); if (ret == -EOPNOTSUPP) @@ -2936,7 +2937,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, - end + 1 - start, NULL); + end + 1 - start, NULL, true); next_state = btrfs_next_extent_state(unpin, cached_state); btrfs_clear_extent_dirty(unpin, start, end, &cached_state); @@ -2994,7 +2995,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = -EROFS; if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, - block_group->length, NULL); + block_group->length, NULL, true); /* * Not strictly necessary to lock, as the block_group should be diff --git a/fs/btrfs/extent-tree.h 
b/fs/btrfs/extent-tree.h index d7b6aeb63656..ff330d4896d6 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -161,7 +161,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent); void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); + u64 num_bytes, u64 *actual_bytes, bool do_remap); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info); int btrfs_complete_bg_remapping(struct btrfs_block_group *bg); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8d4db3d57cf7..17e79ee3e021 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3677,7 +3677,7 @@ static int do_trimming(struct btrfs_block_group *block_group, } spin_unlock(&space_info->lock); - ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); + ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false); if (!ret) { *total_trimmed += trimmed; trim_state = BTRFS_TRIM_STATE_TRIMMED; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be47aa58e944..691aa5119c0b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3366,7 +3366,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, - NULL); + NULL, true); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, true); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e85ffeda006d..d42b8d50aea2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3419,7 +3419,7 @@ static int btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info, * filesystem's point of view. 
*/ if (btrfs_is_zoned(fs_info)) { - ret = btrfs_discard_extent(fs_info, bg->start, length, NULL); + ret = btrfs_discard_extent(fs_info, bg->start, length, NULL, true); if (ret) btrfs_info(fs_info, "failed to reset zone %llu after relocation", bg->start); @@ -6101,7 +6101,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) */ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - u32 *num_stripes) + u32 *num_stripes, bool do_remap) { struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; @@ -6125,6 +6125,24 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, if (IS_ERR(map)) return ERR_CAST(map); + if (do_remap && (map->type & BTRFS_BLOCK_GROUP_REMAPPED)) { + u64 new_logical = logical; + + ret = btrfs_translate_remap(fs_info, &new_logical, &length); + if (ret) + goto out_free_map; + + if (new_logical != logical) { + btrfs_free_chunk_map(map); + + map = btrfs_get_chunk_map(fs_info, new_logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); + + logical = new_logical; + } + } + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ed1f8fd95fff..e4644352314a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -714,7 +714,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, u32 length, int mirror_num); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - u32 *num_stripes); + u32 *num_stripes, bool do_remap); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, From 81e5a4551c32b454468f5aa3fe45dabb6bccb854 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:15 +0000 Subject: [PATCH 065/137] btrfs: allow balancing remap tree 
Balancing the METADATA_REMAP chunk, i.e. the chunk in which the remap tree lives, is a special case. We can't use the remap tree itself for this, as then we'd have no way to bootstrap it on mount. And we can't use the pre-remap tree code for this as it relies on walking the extent tree, and we're not creating backrefs for METADATA_REMAP chunks. So instead, if a balance would relocate any METADATA_REMAP block groups, mark those block groups as readonly and COW every leaf of the remap tree. There are more sophisticated ways of doing this, such as only COWing nodes within a block group that's to be relocated, but they're fiddly and with lots of edge cases. Plus it's not anticipated that a) the number of METADATA_REMAP chunks is going to be particularly large, or b) that users will want to only relocate some of these chunks - the main use case here is to unbreak RAID conversion and device removal. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 152 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d42b8d50aea2..af0197b242a7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3990,8 +3990,11 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); - if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) - return false; + /* Treat METADATA_REMAP chunks as METADATA.
*/ + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { + chunk_type &= ~BTRFS_BLOCK_GROUP_METADATA_REMAP; + chunk_type |= BTRFS_BLOCK_GROUP_METADATA; + } /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & @@ -4074,6 +4077,107 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk return true; } +struct remap_chunk_info { + struct list_head list; + u64 offset; + struct btrfs_block_group *bg; + bool made_ro; +}; + +static int cow_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key key = { 0 }; + int ret; + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret < 0) + return ret; + + while (true) { + ret = btrfs_next_leaf(fs_info->remap_root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + ret = 0; + break; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + btrfs_release_path(path); + + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); + if (ret < 0) + break; + } + + return ret; +} + +static int balance_remap_chunks(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct list_head *chunks) +{ + struct remap_chunk_info *rci, *tmp; + struct btrfs_trans_handle *trans; + int ret; + + list_for_each_entry_safe(rci, tmp, chunks, list) { + rci->bg = btrfs_lookup_block_group(fs_info, rci->offset); + if (!rci->bg) { + list_del(&rci->list); + kfree(rci); + continue; + } + + ret = btrfs_inc_block_group_ro(rci->bg, false); + if (ret) + goto end; + + rci->made_ro = true; + } + + if (list_empty(chunks)) + return 0; + + trans = btrfs_start_transaction(fs_info->remap_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto end; + } + + mutex_lock(&fs_info->remap_mutex); + ret = cow_remap_tree(trans, path); + mutex_unlock(&fs_info->remap_mutex); + + btrfs_release_path(path); + btrfs_commit_transaction(trans); + +end: + while (!list_empty(chunks)) { 
+ bool is_unused; + + rci = list_first_entry(chunks, struct remap_chunk_info, list); + + spin_lock(&rci->bg->lock); + is_unused = !btrfs_is_block_group_used(rci->bg); + spin_unlock(&rci->bg->lock); + + if (is_unused) + btrfs_mark_bg_unused(rci->bg); + + if (rci->made_ro) + btrfs_dec_block_group_ro(rci->bg); + + btrfs_put_block_group(rci->bg); + + list_del(&rci->list); + kfree(rci); + } + + return ret; +} + static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4096,6 +4200,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + struct remap_chunk_info *rci; + unsigned int num_remap_chunks = 0; + LIST_HEAD(remap_chunks); path = btrfs_alloc_path(); if (!path) { @@ -4194,7 +4301,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) count_data++; else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) count_sys++; - else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_METADATA_REMAP)) count_meta++; goto loop; @@ -4214,6 +4322,29 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) goto loop; } + /* + * Balancing METADATA_REMAP chunks takes place separately - add + * the details to a list so it can be processed later. 
+ */ + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { + mutex_unlock(&fs_info->reclaim_bgs_lock); + + rci = kmalloc(sizeof(struct remap_chunk_info), GFP_NOFS); + if (!rci) { + ret = -ENOMEM; + goto error; + } + + rci->offset = found_key.offset; + rci->bg = NULL; + rci->made_ro = false; + list_add_tail(&rci->list, &remap_chunks); + + num_remap_chunks++; + + goto loop; + } + if (!chunk_reserved) { /* * We may be relocating the only data chunk we have, @@ -4253,11 +4384,24 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) key.offset = found_key.offset - 1; } + btrfs_release_path(path); + if (counting) { - btrfs_release_path(path); counting = false; goto again; } + + if (!list_empty(&remap_chunks)) { + ret = balance_remap_chunks(fs_info, path, &remap_chunks); + if (ret == -ENOSPC) + enospc_errors++; + + if (!ret) { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed += num_remap_chunks; + spin_unlock(&fs_info->balance_lock); + } + } error: if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", From 7cddbb4339d4be16aa5341e3a27e63c34d2c4e0d Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:16 +0000 Subject: [PATCH 066/137] btrfs: handle discarding fully-remapped block groups Discard normally works by iterating over the free-space entries of a block group. This doesn't work for fully-remapped block groups, as we removed their free-space entries when we started relocation. For sync discard, call btrfs_discard_extent() when we commit the transaction in which the last identity remap was removed. For async discard, add a new function btrfs_trim_fully_remapped_block_group() to be called by the discard worker, which iterates over the block group's range using the normal async discard rules. Once we reach the end, remove the chunk's stripes and device extents to get back its free space. 
Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 30 +++++++++++---------- fs/btrfs/block-group.h | 1 + fs/btrfs/discard.c | 52 ++++++++++++++++++++++++++++++++----- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/free-space-cache.c | 33 +++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 1 + 6 files changed, 99 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index dc80f147e98d..cfc1e363e3ec 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4792,18 +4792,22 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, { struct btrfs_fs_info *fs_info = trans->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - /* - * The block group might already be on the unused_bgs list, remove it - * if it is. It'll get readded after the async discard worker finishes, - * or in btrfs_handle_fully_remapped_bgs() if we're not using async - * discard. - */ - if (!list_empty(&bg->bg_list)) - list_del(&bg->bg_list); - else - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); - spin_unlock(&fs_info->unused_bgs_lock); + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_queue_work(&fs_info->discard_ctl, bg); + } else { + spin_lock(&fs_info->unused_bgs_lock); + /* + * The block group might already be on the unused_bgs list, + * remove it if it is. It'll get readded after + * btrfs_handle_fully_remapped_bgs() finishes. 
+ */ + if (!list_empty(&bg->bg_list)) + list_del(&bg->bg_list); + else + btrfs_get_block_group(bg); + + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + } } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a775c0bc40c3..29d6e682accd 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -49,6 +49,7 @@ enum btrfs_discard_state { BTRFS_DISCARD_EXTENTS, BTRFS_DISCARD_BITMAPS, BTRFS_DISCARD_RESET_CURSOR, + BTRFS_DISCARD_FULLY_REMAPPED, }; /* diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index ee5f5b2788e1..1c304bf473e5 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -215,6 +215,25 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } +/* + * Check whether a block group is empty. + * + * "Empty" here means that there are no extents physically located within the + * device extents corresponding to this block group. + * + * For a remapped block group, this means that all of its identity remaps have + * been removed. For a non-remapped block group, this means that no extents + * have an address within its range, and that nothing has been remapped to be + * within it. + */ +static bool block_group_is_empty(const struct btrfs_block_group *bg) +{ + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) + return bg->identity_remap_count == 0; + + return bg->used == 0 && bg->remap_bytes == 0; +} + /* * Look up next block group and set it for use. 
* @@ -241,8 +260,10 @@ static struct btrfs_block_group *peek_discard_list( block_group = find_next_block_group(discard_ctl, now); if (block_group && now >= block_group->discard_eligible_time) { + const bool empty = block_group_is_empty(block_group); + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && - block_group->used != 0) { + !empty) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); /* @@ -267,7 +288,12 @@ static struct btrfs_block_group *peek_discard_list( } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { block_group->discard_cursor = block_group->start; - block_group->discard_state = BTRFS_DISCARD_EXTENTS; + + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) { + block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED; + } else { + block_group->discard_state = BTRFS_DISCARD_EXTENTS; + } } } if (block_group) { @@ -373,7 +399,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; - if (block_group->used == 0 && block_group->remap_bytes == 0) + if (block_group_is_empty(block_group)) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); @@ -470,7 +496,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, { remove_from_discard_list(discard_ctl, block_group); - if (block_group->used == 0) { + if (block_group_is_empty(block_group)) { if (btrfs_is_free_space_trimmed(block_group)) btrfs_mark_bg_unused(block_group); else @@ -524,7 +550,8 @@ static void btrfs_discard_workfn(struct work_struct *work) /* Perform discarding */ minlen = discard_minlen[discard_index]; - if (discard_state == BTRFS_DISCARD_BITMAPS) { + switch (discard_state) { + case BTRFS_DISCARD_BITMAPS: { u64 maxlen = 0; /* @@ -541,17 +568,28 @@ static void btrfs_discard_workfn(struct work_struct *work) 
btrfs_block_group_end(block_group), minlen, maxlen, true); discard_ctl->discard_bitmap_bytes += trimmed; - } else { + + break; + } + + case BTRFS_DISCARD_FULLY_REMAPPED: + btrfs_trim_fully_remapped_block_group(block_group); + break; + + default: btrfs_trim_block_group_extents(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, true); discard_ctl->discard_extent_bytes += trimmed; + + break; } /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { - if (discard_state == BTRFS_DISCARD_BITMAPS) { + if (discard_state == BTRFS_DISCARD_BITMAPS || + discard_state == BTRFS_DISCARD_FULLY_REMAPPED) { btrfs_finish_discard_pass(discard_ctl, block_group); } else { block_group->discard_cursor = block_group->start; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c063c5b6c433..6fab7765057e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2905,6 +2905,8 @@ void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info) list_del_init(&bg->bg_list); spin_unlock(&fs_info->unused_bgs_lock); + btrfs_discard_extent(fs_info, bg->start, bg->length, NULL, false); + ret = btrfs_complete_bg_remapping(bg); if (ret) { btrfs_put_block_group(bg); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 17e79ee3e021..a4a941cde866 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,7 @@ #include "file-item.h" #include "file.h" #include "super.h" +#include "relocation.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -3066,6 +3067,11 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) struct rb_node *node; bool ret = true; + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && + block_group->identity_remap_count == 0) { + return true; + } + spin_lock(&ctl->tree_lock); node = rb_first(&ctl->free_space_offset); @@ -3834,6 +3840,33 @@ 
static int trim_no_bitmap(struct btrfs_block_group *block_group, return ret; } +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int ret = 0; + u64 bytes, trimmed; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); + u64 end = btrfs_block_group_end(bg); + + bytes = end - bg->discard_cursor; + + if (max_discard_size && + bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) + bytes = max_discard_size; + + ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false); + if (ret) + return; + + bg->discard_cursor += trimmed; + + if (bg->discard_cursor < end) + return; + + btrfs_complete_bg_remapping(bg); +} + /* * If we break out of trimming a bitmap prematurely, we should reset the * trimming bit. In a rather contrived case, it's possible to race here so diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 9f1dbfdee8ca..33fc3b245648 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async); +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg); bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); From 2aef934b56b3ae07c292831cf9bf6bafcaaa005e Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Wed, 7 Jan 2026 14:09:17 +0000 Subject: [PATCH 067/137] btrfs: populate fully_remapped_bgs_list on mount Add a function btrfs_populate_fully_remapped_bgs_list() which gets called on mount, which looks for fully remapped block groups (i.e. 
identity_remap_count == 0) which haven't yet had their chunk stripes and device extents removed. This happens when a filesystem is unmounted while async discard has not yet finished, as otherwise the data range occupied by the chunk stripes would be permanently unusable. Reviewed-by: Boris Burkov Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 75 +++++++++++++++++++++++++++++++++++++ fs/btrfs/block-group.h | 2 + fs/btrfs/disk-io.c | 8 ++++ fs/btrfs/free-space-cache.c | 18 +++++++++ fs/btrfs/relocation.c | 4 ++ 5 files changed, 107 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index cfc1e363e3ec..eb8a289663d0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4794,6 +4794,10 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + spin_lock(&bg->lock); + set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + btrfs_discard_queue_work(&fs_info->discard_ctl, bg); } else { spin_lock(&fs_info->unused_bgs_lock); @@ -4811,3 +4815,74 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, spin_unlock(&fs_info->unused_bgs_lock); } } + +/* + * Compare the block group and chunk trees, and find any fully-remapped block + * groups which haven't yet had their chunk stripes and device extents removed, + * and put them on the fully_remapped_bgs list so this gets done. + * + * This happens when a block group becomes fully remapped, i.e. its last + * identity mapping is removed, and the volume is unmounted before async + * discard has finished. It's important this gets done as until it is the + * chunk's stripes are dead space. 
+ */ +int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node_bg, *node_chunk; + + node_bg = rb_first_cached(&fs_info->block_group_cache_tree); + node_chunk = rb_first_cached(&fs_info->mapping_tree); + + while (node_bg && node_chunk) { + struct btrfs_block_group *bg; + struct btrfs_chunk_map *map; + + bg = rb_entry(node_bg, struct btrfs_block_group, cache_node); + map = rb_entry(node_chunk, struct btrfs_chunk_map, rb_node); + + ASSERT(bg->start == map->start); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) + goto next; + + if (bg->identity_remap_count != 0) + goto next; + + if (map->num_stripes == 0) + goto next; + + spin_lock(&fs_info->unused_bgs_lock); + + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + } else { + list_move_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); + } + + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Ideally we'd want to call btrfs_discard_queue_work() here, + * but it'd do nothing as the discard worker hasn't been + * started yet. + * + * The block group will get added to the discard list when + * btrfs_handle_fully_remapped_bgs() gets called, when we + * commit the first transaction. 
+ */ + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + spin_lock(&bg->lock); + set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + } + +next: + node_bg = rb_next(node_bg); + node_chunk = rb_next(node_chunk); + } + + ASSERT(!node_bg && !node_chunk); + + return 0; +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 29d6e682accd..c03e04292900 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -94,6 +94,7 @@ enum btrfs_block_group_flags { */ BLOCK_GROUP_FLAG_NEW, BLOCK_GROUP_FLAG_FULLY_REMAPPED, + BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, }; enum btrfs_caching_type { @@ -418,5 +419,6 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, struct btrfs_trans_handle *trans); +int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 627282613eee..32fffb0557e5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3601,6 +3601,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { + ret = btrfs_populate_fully_remapped_bgs_list(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to populate fully_remapped_bgs list: %d", ret); + goto fail_sysfs; + } + } + btrfs_zoned_reserve_data_reloc_bg(fs_info); btrfs_free_zone_cache(fs_info); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a4a941cde866..af5f57bd44e6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3068,6 +3068,7 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) bool ret = true; if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && + !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, 
&block_group->runtime_flags) && block_group->identity_remap_count == 0) { return true; } @@ -3849,6 +3850,23 @@ void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg) const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); u64 end = btrfs_block_group_end(bg); + if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) { + bg->discard_cursor = end; + + if (bg->used == 0) { + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&bg->bg_list)) { + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + } + spin_unlock(&fs_info->unused_bgs_lock); + + btrfs_mark_bg_unused(bg); + } + + return; + } + bytes = end - bg->discard_cursor; if (max_discard_size && diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3f8017ad0033..fcd0a2ba3554 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4743,6 +4743,10 @@ int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, btrfs_remove_bg_from_sinfo(bg); + spin_lock(&bg->lock); + clear_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); + spin_unlock(&bg->lock); + ret = remove_chunk_stripes(trans, chunk_map, path); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); From 57a4a863cda8b02dc0d46a36ec5cd5f86993b7aa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 19 Jan 2026 08:17:50 +0100 Subject: [PATCH 068/137] btrfs: remove bogus NULL checks in __btrfs_write_out_cache() Dan reported a new smatch warning in free-space-cache.c: New smatch warnings: fs/btrfs/free-space-cache.c:1207 write_pinned_extent_entries() warn: variable dereferenced before check 'block_group' (see line 1203) But the check if the block_group pointer is NULL is bogus, because to get to this point block_group::io_ctl has already been dereferenced further up the call-chain when calling __btrfs_write_out_cache() from btrfs_write_out_cache(). Remove the bogus checks for block_group == NULL in __btrfs_write_out_cache() and its siblings. 
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601170636.WsePMV5H-lkp@intel.com/ Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index af5f57bd44e6..28132b6d8f88 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1080,7 +1080,7 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) { + if (!list_empty(&block_group->cluster_list)) { cluster = list_first_entry(&block_group->cluster_list, struct btrfs_free_cluster, block_group_list); } @@ -1204,9 +1204,6 @@ static noinline_for_stack int write_pinned_extent_entries( struct extent_io_tree *unpin = NULL; int ret; - if (!block_group) - return 0; - /* * We want to add any pinned extents to our free space cache * so we don't leak the space @@ -1394,7 +1391,7 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) return ret; - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { down_write(&block_group->data_rwsem); spin_lock(&block_group->lock); if (block_group->delalloc_bytes) { @@ -1466,7 +1463,7 @@ static int __btrfs_write_out_cache(struct inode *inode, goto out_nospc; } - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) up_write(&block_group->data_rwsem); /* * Release the pages and unlock the extent, we will flush @@ -1501,7 +1498,7 @@ static int __btrfs_write_out_cache(struct inode *inode, cleanup_write_cache_enospc(inode, io_ctl, &cached_state); out_unlock: - if (block_group && (block_group->flags & 
BTRFS_BLOCK_GROUP_DATA)) + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) up_write(&block_group->data_rwsem); out: From 4ac81c381102bebf09a47946b343d70ed455c646 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 16 Jan 2026 10:24:06 +0000 Subject: [PATCH 069/137] btrfs: use the btrfs_block_group_end() helper everywhere We have a helper to calculate a block group's exclusive end offset, but we only use it in some places. Update every site that open codes the calculation to use the helper. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 21 ++++++++-------- fs/btrfs/extent-tree.c | 9 ++++--- fs/btrfs/free-space-cache.c | 8 +++---- fs/btrfs/free-space-tree.c | 33 ++++++++++---------------- fs/btrfs/scrub.c | 9 +++---- fs/btrfs/tests/free-space-tree-tests.c | 4 ++-- fs/btrfs/zoned.c | 9 +++---- 7 files changed, 43 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index eb8a289663d0..4fc4d49910bf 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -239,7 +239,7 @@ static struct btrfs_block_group *block_group_cache_tree_search( while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); - end = cache->start + cache->length - 1; + end = btrfs_block_group_end(cache) - 1; start = cache->start; if (bytenr < start) { @@ -292,7 +292,7 @@ struct btrfs_block_group *btrfs_next_block_group( /* If our block group was removed, we need a full search. 
*/ if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->start + cache->length; + const u64 next_bytenr = btrfs_block_group_end(cache); read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); @@ -595,7 +595,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; u64 search_offset; - u64 search_end = block_group->start + block_group->length; + const u64 search_end = btrfs_block_group_end(block_group); BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -711,6 +711,7 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group = caching_ctl->block_group; + const u64 block_group_end = btrfs_block_group_end(block_group); struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; BTRFS_PATH_AUTO_FREE(path); @@ -807,7 +808,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) continue; } - if (key.objectid >= block_group->start + block_group->length) + if (key.objectid >= block_group_end) break; if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -836,9 +837,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->slots[0]++; } - ret = btrfs_add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); + ret = btrfs_add_new_free_space(block_group, last, block_group_end, NULL); out: return ret; } @@ -846,7 +845,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_DIRTY, NULL); + btrfs_block_group_end(bg) - 1, EXTENT_DIRTY, NULL); } static 
noinline void caching_thread(struct btrfs_work *work) @@ -2267,7 +2266,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) while (nr--) { u64 len = min_t(u64, stripe_len, - cache->start + cache->length - logical[nr]); + btrfs_block_group_end(cache) - logical[nr]); cache->bytes_super += len; ret = btrfs_set_extent_bit(&fs_info->excluded_extents, @@ -2470,7 +2469,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, } else if (cache->used == 0 && cache->remap_bytes == 0) { cache->cached = BTRFS_CACHE_FINISHED; ret = btrfs_add_new_free_space(cache, cache->start, - cache->start + cache->length, NULL); + btrfs_block_group_end(cache), NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -3763,7 +3762,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return -ENOENT; /* An extent can not span multiple block groups. */ - ASSERT(bytenr + num_bytes <= cache->start + cache->length); + ASSERT(bytenr + num_bytes <= btrfs_block_group_end(cache)); space_info = cache->space_info; factor = btrfs_bg_type_to_factor(cache->flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6fab7765057e..bd4d134a3380 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2784,8 +2784,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 len; bool readonly; - if (!cache || - start >= cache->start + cache->length) { + if (!cache || start >= btrfs_block_group_end(cache)) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; @@ -2801,7 +2800,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, empty_cluster <<= 1; } - len = cache->start + cache->length - start; + len = btrfs_block_group_end(cache) - start; len = min(len, end + 1 - start); if (return_free_space) @@ -4683,7 +4682,7 @@ static noinline int find_free_extent(struct btrfs_root *root, /* move on to the next group */ if (ffe_ctl->search_start + ffe_ctl->num_bytes > - block_group->start + block_group->length) { + 
btrfs_block_group_end(block_group)) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); @@ -6651,7 +6650,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) } start = max(range->start, cache->start); - end = min(range_end, cache->start + cache->length); + end = min(range_end, btrfs_block_group_end(cache)); if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 28132b6d8f88..8dd15865ab0a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1201,6 +1201,7 @@ static noinline_for_stack int write_pinned_extent_entries( int *entries) { u64 start, extent_start, extent_end, len; + const u64 block_group_end = btrfs_block_group_end(block_group); struct extent_io_tree *unpin = NULL; int ret; @@ -1215,19 +1216,18 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; - while (start < block_group->start + block_group->length) { + while (start < block_group_end) { if (!btrfs_find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ - if (extent_start >= block_group->start + block_group->length) + if (extent_start >= block_group_end) return 0; extent_start = max(extent_start, start); - extent_end = min(block_group->start + block_group->length, - extent_end + 1); + extent_end = min(block_group_end, extent_end + 1); len = extent_end - extent_start; *entries += 1; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 96d52c031977..ecddfca92b2b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, return 0; start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; 
key.type = (u8)-1; @@ -358,7 +358,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, return 0; start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; key.type = (u8)-1; @@ -665,7 +665,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately after the extent of space if * that block is within the block group. */ - if (end < block_group->start + block_group->length) { + if (end < btrfs_block_group_end(block_group)) { /* The next block may be in the next bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (end >= key.objectid + key.offset) { @@ -938,7 +938,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, right: /* Search for a neighbor on the right. */ - if (end == block_group->start + block_group->length) + if (end == btrfs_block_group_end(block_group)) goto insert; key.objectid = end; key.type = (u8)-1; @@ -1104,7 +1104,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * highest, block group). 
*/ start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -1477,7 +1477,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, } start = block_group->start; - end = block_group->start + block_group->length; + end = btrfs_block_group_end(block_group); key.objectid = end - 1; key.type = (u8)-1; @@ -1530,24 +1530,21 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group *block_group; - struct btrfs_fs_info *fs_info; + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root; struct btrfs_key key; bool prev_bit_set = false; /* Initialize to silence GCC. */ u64 extent_start = 0; - u64 end, offset; + const u64 end = btrfs_block_group_end(block_group); + u64 offset; u64 total_found = 0; u32 extent_count = 0; int ret; - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); - end = block_group->start + block_group->length; - while (1) { ret = btrfs_next_item(root, path); if (ret < 0) @@ -1613,21 +1610,17 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group *block_group; - struct btrfs_fs_info *fs_info; + struct btrfs_block_group *block_group = caching_ctl->block_group; + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root; struct btrfs_key key; - u64 end; + const u64 end = btrfs_block_group_end(block_group); u64 total_found = 0; u32 extent_count = 0; int ret; - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; root = btrfs_free_space_root(block_group); - end = block_group->start + 
block_group->length; - while (1) { u64 space_added; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2372084cf6c5..0bd4aebe1687 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1688,9 +1688,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. */ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", - bg->start, logical_start, logical_end, bg->start + bg->length); + bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); @@ -2319,7 +2319,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, int ret = 0; /* The range must be inside the bg */ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { @@ -2411,12 +2411,13 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, const u64 logical_increment = simple_stripe_full_stripe_len(map); const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); const u64 orig_physical = map->stripes[stripe_index].physical; + const u64 end = btrfs_block_group_end(bg); const int mirror_num = simple_stripe_mirror_num(map, stripe_index); u64 cur_logical = orig_logical; u64 cur_physical = orig_physical; int ret = 0; - while (cur_logical < bg->start + bg->length) { + while (cur_logical < end) { /* * Inside each stripe, RAID0 is just SINGLE, and RAID10 is * just RAID1, so we can reuse scrub_simple_mirror() to scrub diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index c8822edd32e2..8dee057f41fd 100644 --- 
a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -49,7 +49,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { if (path->slots[0] != 0) goto invalid; - end = cache->start + cache->length; + end = btrfs_block_group_end(cache); i = 0; while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -216,7 +216,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans, int ret; ret = __btrfs_remove_from_free_space_tree(trans, cache, path, - cache->start + cache->length - alignment, + btrfs_block_group_end(cache) - alignment, alignment); if (ret) { test_err("could not remove free space"); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2e861eef5cd8..d6a2480d5dc1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1231,6 +1231,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; + const u64 bg_end = btrfs_block_group_end(cache); int ret; u64 length; @@ -1253,7 +1254,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!path) return -ENOMEM; - key.objectid = cache->start + cache->length; + key.objectid = bg_end; key.type = 0; key.offset = 0; @@ -1282,7 +1283,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, length = fs_info->nodesize; if (unlikely(!(found_key.objectid >= cache->start && - found_key.objectid + length <= cache->start + cache->length))) { + found_key.objectid + length <= bg_end))) { return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; @@ -2028,7 +2029,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group) { if (block_group->start > eb->start || - block_group->start + block_group->length <= eb->start) { + btrfs_block_group_end(block_group) <= eb->start) { 
btrfs_put_block_group(block_group); block_group = NULL; ctx->zoned_bg = NULL; @@ -2248,7 +2249,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - const u64 end = block_group->start + block_group->length; + const u64 end = btrfs_block_group_end(block_group); struct extent_buffer *eb; unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); From ea7ab405c55b6ac4b5c3e61ef37cf697067e3c71 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 16 Jan 2026 10:57:00 +0000 Subject: [PATCH 070/137] btrfs: use the btrfs_extent_map_end() helper everywhere We have a helper to calculate an extent map's exclusive end offset, but we only use it in some places. Update every site that open codes the calculation to use the helper. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/defrag.c | 5 +++-- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file.c | 9 +++++---- fs/btrfs/inode.c | 2 +- fs/btrfs/tests/inode-tests.c | 32 ++++++++++++++++---------------- fs/btrfs/tree-log.c | 2 +- 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4323d4172c7b..4c6298cf01b2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -519,7 +519,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, folio_put(folio); break; } - add_size = min(em->start + em->len, page_end + 1) - cur; + add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur; btrfs_free_extent_map(em); btrfs_unlock_extent(tree, cur, page_end, NULL); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index bcc6656ad034..ecf05cd64696 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -792,10 +792,11 @@ static bool defrag_check_next_extent(struct inode *inode, 
struct extent_map *em, { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_map *next; + const u64 em_end = btrfs_extent_map_end(em); bool ret = false; /* This is the last extent */ - if (em->start + em->len >= i_size_read(inode)) + if (em_end >= i_size_read(inode)) return false; /* @@ -804,7 +805,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * one will not be a target. * This will just cause extra IO without really reducing the fragments. */ - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + next = defrag_lookup_extent(inode, em_end, newer_than, locked); /* No more em or hole */ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) goto out; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f804131b1c78..dfc17c292217 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -970,7 +970,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, { const u64 ra_pos = readahead_pos(ractl); const u64 ra_end = ra_pos + readahead_length(ractl); - const u64 em_end = em->start + em->len; + const u64 em_end = btrfs_extent_map_end(em); /* No expansion for holes and inline extents. */ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5d47cff5af42..1759776d2d57 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2195,10 +2195,11 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->disk_bytenr == EXTENT_MAP_HOLE) { + const u64 em_end = btrfs_extent_map_end(em); + ret = 1; - *len = em->start + em->len > *start + *len ? - 0 : *start + *len - em->start - em->len; - *start = em->start + em->len; + *len = (em_end > *start + *len) ? 
0 : (*start + *len - em_end); + *start = em_end; } btrfs_free_extent_map(em); return ret; @@ -2947,7 +2948,7 @@ static int btrfs_zero_range(struct inode *inode, * new prealloc extent, so that we get a larger contiguous disk extent. */ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { - const u64 em_end = em->start + em->len; + const u64 em_end = btrfs_extent_map_end(em); if (em_end >= offset + len) { /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 691aa5119c0b..fa110827aaab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7161,7 +7161,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, read_unlock(&em_tree->lock); if (em) { - if (em->start > start || em->start + em->len <= start) + if (em->start > start || btrfs_extent_map_end(em) <= start) btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) btrfs_free_extent_map(em); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a4c2b7748b95..6bd17d059ae6 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -313,7 +313,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * unless we have a page for it to write into. Maybe we should change * this? 
*/ - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -335,7 +335,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Regular extent */ @@ -362,7 +362,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* The next 3 are split extents */ @@ -391,7 +391,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -446,7 +446,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Prealloc extent */ @@ -474,7 +474,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ @@ -504,7 +504,7 @@ static 
noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -536,7 +536,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -569,7 +569,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Now for the compressed extent */ @@ -602,7 +602,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* Split compressed extent */ @@ -637,7 +637,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) } disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -663,7 +663,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -697,7 +697,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_COMPRESS_ZLIB, 
btrfs_extent_map_compression(em)); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ @@ -724,7 +724,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); @@ -756,7 +756,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - offset = em->start + em->len; + offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6cffcf0c3e7a..e1bd03ebfd98 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5160,7 +5160,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; - drop_args.end = em->start + em->len; + drop_args.end = btrfs_extent_map_end(em); drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); From de62f138f95eacd71121d9fdc4815cb90a56ccbd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 20 Jan 2026 09:19:46 +0100 Subject: [PATCH 071/137] btrfs: don't pass io_ctl to __btrfs_write_out_cache() There's no need to pass both the block_group and block_group::io_ctl to __btrfs_write_out_cache(). Remove passing io_ctl to __btrfs_write_out_cache() and dereference it inside __btrfs_write_out_cache(). 
Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8dd15865ab0a..6fb813cc224b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1372,9 +1372,9 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, - struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { + struct btrfs_io_ctl *io_ctl = &block_group->io_ctl; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1534,8 +1534,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(inode, ctl, block_group, - &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", From 19eff93dc738e8afaa59cb374b44bb5a162e6c2d Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Wed, 14 Jan 2026 11:47:02 +0800 Subject: [PATCH 072/137] btrfs: fix periodic reclaim condition Problems with current implementation: 1. reclaimable_bytes is signed while chunk_sz is unsigned, causing negative reclaimable_bytes to trigger reclaim unexpectedly 2. The "space must be freed between scans" assumption breaks the two-scan requirement: first scan marks block groups, second scan reclaims them. Without the second scan, no reclamation occurs. Instead, track actual reclaim progress: pause reclaim when block groups will be reclaimed, and resume only when progress is made. This ensures reclaim continues until no further progress can be made. And resume periodic reclaim when there's enough free space. 
And we take care if reclaim is making any progress now, so it's unnecessary to set periodic_reclaim_ready to false when failed to reclaim a block group. Fixes: 813d4c6422516 ("btrfs: prevent pathological periodic reclaim loops") CC: stable@vger.kernel.org # 6.12+ Suggested-by: Boris Burkov Reviewed-by: Boris Burkov Signed-off-by: Sun YangKai Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 ++++-- fs/btrfs/space-info.c | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4fc4d49910bf..6c6133584196 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1889,6 +1889,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) while (!list_empty(&fs_info->reclaim_bgs)) { u64 used; u64 reserved; + u64 old_total; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1954,6 +1955,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_unlock(&bg->lock); + old_total = space_info->total_bytes; spin_unlock(&space_info->lock); /* @@ -2006,14 +2008,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; - if (READ_ONCE(space_info->periodic_reclaim)) - space_info->periodic_reclaim_ready = false; spin_unlock(&space_info->lock); } spin_lock(&space_info->lock); space_info->reclaim_count++; space_info->reclaim_bytes += used; space_info->reclaim_bytes += reserved; + if (space_info->total_bytes < old_total) + btrfs_set_periodic_reclaim_ready(space_info, true); spin_unlock(&space_info->lock); next: diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ebe97d6d67d3..bc493243f777 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -2098,11 +2098,11 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) +static bool 
do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; - bool try_again = true; + bool will_reclaim = false; bool urgent; spin_lock(&space_info->lock); @@ -2120,7 +2120,7 @@ static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) spin_lock(&bg->lock); thresh = mult_perc(bg->length, thresh_pct); if (bg->used < thresh && bg->reclaim_mark) { - try_again = false; + will_reclaim = true; reclaim = true; } bg->reclaim_mark++; @@ -2137,12 +2137,13 @@ static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) * If we have any staler groups, we don't touch the fresher ones, but if we * really need a block group, do take a fresh one. */ - if (try_again && urgent) { - try_again = false; + if (!will_reclaim && urgent) { + urgent = false; goto again; } up_read(&space_info->groups_sem); + return will_reclaim; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2152,7 +2153,8 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6 lockdep_assert_held(&space_info->lock); space_info->reclaimable_bytes += bytes; - if (space_info->reclaimable_bytes >= chunk_sz) + if (space_info->reclaimable_bytes > 0 && + space_info->reclaimable_bytes >= chunk_sz) btrfs_set_periodic_reclaim_ready(space_info, true); } @@ -2179,7 +2181,6 @@ static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); ret = space_info->periodic_reclaim_ready; - btrfs_set_periodic_reclaim_ready(space_info, false); spin_unlock(&space_info->lock); return ret; @@ -2193,8 +2194,10 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(space_info, raid); + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { + if 
(do_reclaim_sweep(space_info, raid)) + btrfs_set_periodic_reclaim_ready(space_info, false); + } } } From 4b7ecd0984e34262430b9db7efbfaf293b4b4d3c Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Wed, 14 Jan 2026 11:47:03 +0800 Subject: [PATCH 073/137] btrfs: consolidate reclaim readiness checks in btrfs_should_reclaim() Move the filesystem state validation from btrfs_reclaim_bgs_work() into btrfs_should_reclaim() to centralize the reclaim eligibility logic. Since it is the only caller of btrfs_should_reclaim(), there's no functional change. Reviewed-by: Boris Burkov Reviewed-by: Johannes Thumshirn Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6c6133584196..022c77f8d0e2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1822,6 +1822,12 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return false; + + if (btrfs_fs_closing(fs_info)) + return false; + if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; @@ -1856,12 +1862,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) struct btrfs_space_info *space_info; LIST_HEAD(retry_list); - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) - return; - - if (btrfs_fs_closing(fs_info)) - return; - if (!btrfs_should_reclaim(fs_info)) return; From 37cc07cab7dc311f2b7aeaaa7598face53eddcab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 20 Jan 2026 10:30:08 +1030 Subject: [PATCH 074/137] btrfs: lzo: use folio_iter to handle lzo_decompress_bio() Currently lzo_decompress_bio() is using compressed_bio->compressed_folios[] array to grab each compressed folio. 
This is making the code much easier to read, as we only need to maintain a single iterator, @cur_in, and can easily grab any random folio using @cur_in >> min_folio_shift as an index. However lzo_decompress_bio() itself is ensured to only advance to the next folio at one time, and compressed_folios[] is just a pointer to each folio of the compressed bio, thus we have no real random access requirement for lzo_decompress_bio(). Replace the compressed_folios[] access by a helper, get_current_folio(), which uses folio_iter and an external folio counter to properly switch the folio when needed. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 4758f66da449..4024ce416971 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -310,23 +310,43 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, return ret; } +static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi, + u32 *cur_folio_index, u32 cur_in) +{ + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + + ASSERT(cur_folio_index); + + /* Need to switch to the next folio. */ + if (cur_in >> min_folio_shift != *cur_folio_index) { + /* We can only do the switch one folio a time. */ + ASSERT(cur_in >> min_folio_shift == *cur_folio_index + 1); + + bio_next_folio(fi, &cb->bbio.bio); + (*cur_folio_index)++; + } + return fi->folio; +} + /* * Copy the compressed segment payload into @dest. * * For the payload there will be no padding, just need to do page switching. 
*/ static void copy_compressed_segment(struct compressed_bio *cb, + struct folio_iter *fi, u32 *cur_folio_index, char *dest, u32 len, u32 *cur_in) { - struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; - u32 copy_len = min_t(u32, orig_in + len - *cur_in, - folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); + struct folio *cur_folio = get_current_folio(cb, fi, cur_folio_index, *cur_in); + u32 copy_len; + ASSERT(cur_folio); + copy_len = min_t(u32, orig_in + len - *cur_in, + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); ASSERT(copy_len); memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, @@ -341,7 +361,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + struct folio_iter fi; char *kaddr; int ret; /* Compressed data length, can be unaligned */ @@ -350,8 +370,15 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) u32 cur_in = 0; /* Bytes decompressed so far */ u32 cur_out = 0; + /* The current folio index number inside the bio. */ + u32 cur_folio_index = 0; - kaddr = kmap_local_folio(cb->compressed_folios[0], 0); + bio_first_folio(&fi, &cb->bbio.bio, 0); + /* There must be a compressed folio and matches the sectorsize. 
*/ + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == sectorsize); + kaddr = kmap_local_folio(fi.folio, 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -388,7 +415,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; + cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); @@ -410,7 +437,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Copy the compressed segment payload into workspace */ - copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf, + seg_len, &cur_in); /* Decompress the data */ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, From 20c993134e105368b9165cb9af8d8c1c2ac59a2d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 20 Jan 2026 10:30:09 +1030 Subject: [PATCH 075/137] btrfs: zlib: use folio_iter to handle zlib_decompress_bio() Currently zlib_decompress_bio() is using compressed_bio->compressed_folios[] array to grab each compressed folio. However cb->compressed_folios[] is just a pointer to each folio of the compressed bio, meaning we can just replace the compressed_folios[] array by just grabbing the folio inside the compressed bio. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 10ed48d4a846..a004aa4ee9e2 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -338,18 +338,23 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); + struct folio_iter fi; const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret = 0, ret2; int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); unsigned long buf_start; - struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_folio(folios_in[folio_in_index], 0); + bio_first_folio(&fi, &cb->bbio.bio, 0); + + /* We must have at least one folio here, that has the correct size. 
*/ + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == min_folio_size); + + data_in = kmap_local_folio(fi.folio, 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); workspace->strm.total_in = 0; @@ -404,12 +409,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - folio_in_index++; - if (folio_in_index >= total_folios_in) { + bio_next_folio(&fi, &cb->bbio.bio); + if (!fi.folio) { data_in = NULL; break; } - data_in = kmap_local_folio(folios_in[folio_in_index], 0); + ASSERT(folio_size(fi.folio) == min_folio_size); + data_in = kmap_local_folio(fi.folio, 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, min_folio_size); From 1914b94231e98280de4ec3a7f10e7abfd928c649 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 20 Jan 2026 10:30:10 +1030 Subject: [PATCH 076/137] btrfs: zstd: use folio_iter to handle zstd_decompress_bio() Currently zstd_decompress_bio() is using compressed_bio->compressed_folios[] array to grab each compressed folio. However cb->compressed_folios[] is just a pointer to each folio of the compressed bio, meaning we can just replace the compressed_folios[] array by just grabbing the folio inside the compressed bio. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index c9cddcfa337b..7fad1e299c7a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -589,7 +589,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); - struct folio **folios_in = cb->compressed_folios; + struct folio_iter fi; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; @@ -600,6 +600,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; unsigned long total_out = 0; + bio_first_folio(&fi, &cb->bbio.bio, 0); + if (unlikely(!fi.folio)) + return -EINVAL; + ASSERT(folio_size(fi.folio) == blocksize); + stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { @@ -612,7 +617,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); @@ -660,8 +665,9 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } srclen -= min_folio_size; - workspace->in_buf.src = - kmap_local_folio(folios_in[folio_in_index], 0); + bio_next_folio(&fi, &cb->bbio.bio); + ASSERT(fi.folio); + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); } From 17078525e5be48381a35068535feb9adb36635f3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 11:42:43 +0000 Subject: [PATCH 077/137] btrfs: make load_block_group_size_class() return 
void There's no point in returning anything since determining and setting a size class for a block group is an optimization, not something critical. The only caller of load_block_group_size_class() (the caching thread) does not do anything with the return value anyway, exactly because having a size class is just an optimization and it can always be set later when adding reserved bytes to a block group (btrfs_add_reserved_bytes()). Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 022c77f8d0e2..20dd4a3d9961 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -673,27 +673,29 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ * 3, we can either read every file extent, or admit that this is best effort * anyway and try to stay fast. * - * Returns: 0 on success, negative error code on error. + * No errors are returned since failing to determine the size class is not a + * critical error, size classes are just an optimization. 
*/ -static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, - struct btrfs_block_group *block_group) +static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl, + struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; u64 min_size = block_group->length; enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; - int ret; if (!btrfs_block_group_should_use_size_class(block_group)) - return 0; + return; lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) { + int ret; + ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); if (ret < 0) - goto out; + return; if (ret > 0) continue; min_size = min_t(u64, min_size, key.offset); @@ -704,8 +706,6 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl block_group->size_class = size_class; spin_unlock(&block_group->lock); } -out: - return ret; } static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) From e46a9f84bfe8f3662b3ae8ad183d6067888b5469 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 11:55:14 +0000 Subject: [PATCH 078/137] btrfs: allocate path on stack in load_block_group_size_class() Instead of allocating and freeing a path in every iteration of load_block_group_size_class(), through its helper function sample_block_group_extent_item(), allocate the path in the former and pass it to the latter. The path is allocated on stack since it's short and we are in a workqueue context so there's not much stack usage. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 20dd4a3d9961..fe42e0e2b597 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -579,24 +579,24 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, * @index: the integral step through the block group to grab from * @max_index: the granularity of the sampling * @key: return value parameter for the item we find + * @path: path to use for searching in the extent tree * * Pre-conditions on indices: * 0 <= index <= max_index * 0 < max_index * - * Returns: 0 on success, 1 if the search didn't yield a useful item, negative - * error code on error. + * Returns: 0 on success, 1 if the search didn't yield a useful item. */ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group, int index, int max_index, - struct btrfs_key *found_key) + struct btrfs_key *found_key, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; u64 search_offset; const u64 search_end = btrfs_block_group_end(block_group); - BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -606,17 +606,9 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET)); - path->skip_locking = true; - path->search_commit_root = true; - path->reada = READA_FORWARD; - search_offset = index * div_u64(block_group->length, max_index); 
search_key.objectid = block_group->start + search_offset; search_key.type = BTRFS_EXTENT_ITEM_KEY; @@ -679,23 +671,37 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group) { + BTRFS_PATH_AUTO_RELEASE(path); struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; u64 min_size = block_group->length; enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; + /* + * Since we run in workqueue context, we allocate the path on stack to + * avoid memory allocation failure, as the stack in a work queue task + * is not deep. + */ + ASSERT(current_work() == &caching_ctl->work.normal_work); + if (!btrfs_block_group_should_use_size_class(block_group)) return; + path.skip_locking = true; + path.search_commit_root = true; + path.reada = READA_FORWARD; + lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) { int ret; - ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); + ret = sample_block_group_extent_item(caching_ctl, block_group, + i, 5, &key, &path); if (ret < 0) return; + btrfs_release_path(&path); if (ret > 0) continue; min_size = min_t(u64, min_size, key.offset); From 0bf63d385f2c6e8bcf9133b35decbffe80ed7da9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 12:02:23 +0000 Subject: [PATCH 079/137] btrfs: don't pass block group argument to load_block_group_size_class() There's no need to pass the block group since we can extract it from the given caching control structure. Same goes for its helper function sample_block_group_extent_item(). 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fe42e0e2b597..635143a8eb07 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -575,7 +575,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, /* * Get an arbitrary extent item index / max_index through the block group * - * @block_group the block group to sample from + * @caching_ctl the caching control containing the block group to sample from * @index: the integral step through the block group to grab from * @max_index: the granularity of the sampling * @key: return value parameter for the item we find @@ -588,11 +588,11 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, * Returns: 0 on success, 1 if the search didn't yield a useful item. */ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, - struct btrfs_block_group *block_group, int index, int max_index, struct btrfs_key *found_key, struct btrfs_path *path) { + struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; u64 search_offset; @@ -668,10 +668,10 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ * No errors are returned since failing to determine the size class is not a * critical error, size classes are just an optimization. 
*/ -static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl, - struct btrfs_block_group *block_group) +static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl) { BTRFS_PATH_AUTO_RELEASE(path); + struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; @@ -697,8 +697,7 @@ static void load_block_group_size_class(struct btrfs_caching_control *caching_ct for (i = 0; i < 5; ++i) { int ret; - ret = sample_block_group_extent_item(caching_ctl, block_group, - i, 5, &key, &path); + ret = sample_block_group_extent_item(caching_ctl, i, 5, &key, &path); if (ret < 0) return; btrfs_release_path(&path); @@ -868,7 +867,7 @@ static noinline void caching_thread(struct btrfs_work *work) mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); - load_block_group_size_class(caching_ctl, block_group); + load_block_group_size_class(caching_ctl); if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { From 954f3217f60277ea035d747275df0ec5c68ad65c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 12:09:20 +0000 Subject: [PATCH 080/137] btrfs: assert block group is locked in btrfs_use_block_group_size_class() It's supposed to be called with the block group locked, in order to read and set its size_class member, so assert it's locked. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 635143a8eb07..11ed303c6640 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4755,6 +4755,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class) { + lockdep_assert_held(&bg->lock); ASSERT(size_class != BTRFS_BG_SZ_NONE); /* The new allocation is in the right size class, do nothing */ From 7c2830f00c3e086292c1ee9f27b61efaf8e76c9a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 1 Nov 2025 10:22:16 +1030 Subject: [PATCH 081/137] btrfs: fallback to buffered IO if the data profile has duplication [BACKGROUND] Inspired by a recent kernel bug report, which is related to direct IO buffer modification during writeback, that leads to contents mismatch of different RAID1 mirrors. [CAUSE AND PROBLEMS] The root cause is exactly the same as explained in commit 968f19c5b1b7 ("btrfs: always fallback to buffered write if the inode requires checksum"), that we can not trust direct IO buffer which can be modified halfway during writeback. Unlike data checksum verification, if this happened on inodes without data checksum but whose data has extra mirrors, it will lead to stealth data mismatch on different mirrors. This will be way harder to detect without data checksum. Furthermore for RAID56, we can even have data without checksum and data with checksum mixed inside the same full stripe. 
In that case if the direct IO buffer got changed halfway for the nodatasum part, the data with checksum immediately lost its ability to recover, e.g.: " " = Good old data or parity calculated using good old data "X" = Data modified during writeback 0 32K 64K Data 1 | | Has csum Data 2 |XXXXXXXXXXXXXXXX | No csum Parity | | In above case, the parity is calculated using data 1 (has csum, from page cache, won't change during writeback), and old data 2 (has no csum, direct IO write). After parity is calculated, but before submission to the storage, direct IO buffer of data 2 is modified, causing the range [0, 32K) of data 2 to have different content. Now all data is submitted to the storage, and the fs got fully synced. Then the device of data 1 is lost, has to be rebuilt from data 2 and parity. But since the data 2 has some modified data, and the parity is calculated using old data, the recovered data is not the same as data 1, causing data checksum mismatch. [FIX] Fix the problem by checking the data allocation profile. If our data allocation profile is either RAID0 or SINGLE, we can allow true zero-copy direct IO and the end user is fully responsible for any race. However this is not going to fix all situations, as it's still possible to race with balance where the fs got a new data profile after the data allocation profile check. But this fix should still greatly reduce the window of the original bug. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=99171 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bc7cc2d81f8f..9a63200d7a53 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -801,6 +801,8 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; unsigned int ilock_flags = 0; struct iomap_dio *dio; + const u64 data_profile = btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_PROFILE_MASK; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -814,6 +816,16 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; + /* + * If our data profile has duplication (either extra mirrors or RAID56), + * we can not trust the direct IO buffer, the content may change during + * writeback and cause different contents written to different mirrors. + * + * Thus only RAID0 and SINGLE can go true zero-copy direct IO. + */ + if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0) + goto buffered; + relock: ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) From 79d51b5c7a2c64f3420ff632df67a76bf01a46ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 21 Jan 2026 20:42:53 +0000 Subject: [PATCH 082/137] btrfs: remove bogus root search condition in sample_block_group_extent_item() There's no need to pass the maximum between the block group's start offset and BTRFS_SUPER_INFO_OFFSET (64K) since we can't have any block groups allocated in the first megabyte, as that's reserved space. 
Furthermore, even if we could, the correct thing to do would be to pass the block group's start offset anyway - and that's precisely what we do for block groups that happen to contain superblock mirror (the range for the super block is never marked as free and it's marked as dirty in the fs_info->excluded_extents io tree). So simplify this and get rid of that maximum expression. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 11ed303c6640..66c512bc022c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -606,8 +606,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, - BTRFS_SUPER_INFO_OFFSET)); + extent_root = btrfs_extent_root(fs_info, block_group->start); search_offset = index * div_u64(block_group->length, max_index); search_key.objectid = block_group->start + search_offset; From a8bec25e014eab671ec8a25b03d391cad3e55230 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 21 Jan 2026 20:48:35 +0000 Subject: [PATCH 083/137] btrfs: deal with missing root in sample_block_group_extent_item() In case the root does not exist, which is unexpected, btrfs_extent_root() returns NULL, but we ignore that and so if it happens we can trigger a NULL pointer dereference later. So verify if we found the root and log an error message in case it's missing. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 66c512bc022c..28aa87872f0d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -607,6 +607,12 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held_read(&fs_info->commit_root_sem); extent_root = btrfs_extent_root(fs_info, block_group->start); + if (unlikely(!extent_root)) { + btrfs_err(fs_info, + "missing extent root for block group at offset %llu", + block_group->start); + return -EUCLEAN; + } search_offset = index * div_u64(block_group->length, max_index); search_key.objectid = block_group->start + search_offset; From 571e75f4c0dba42e57496015987285ed4380d041 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 21 Jan 2026 16:35:56 +0000 Subject: [PATCH 084/137] btrfs: unfold transaction aborts in btrfs_finish_one_ordered() We have a single transaction abort that can be caused either by a failure from a call to btrfs_mark_extent_written(), if we are dealing with a write to a prealloc extent, or otherwise from a call to insert_ordered_extent_file_extent(). So when the transaction abort happens we can not know for sure which case failed. Unfold the aborts so that it's clear in case of a failure. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa110827aaab..6637e451992f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3256,19 +3256,21 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; + } } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); - if (!ret) { - clear_reserved_extent = false; - btrfs_release_delalloc_bytes(fs_info, - ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + goto out; } - } - if (unlikely(ret < 0)) { - btrfs_abort_transaction(trans, ret); - goto out; + clear_reserved_extent = false; + btrfs_release_delalloc_bytes(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, From d77b90cfe08f4cf20dbbc255f8ef13bee831be63 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 22 Jan 2026 09:37:58 +1030 Subject: [PATCH 085/137] btrfs: tests: remove invalid file extent map tests In the inode self tests, there are several problems: - Invalid file extents E.g. hole range [4K, 4K + 4) is completely invalid. Only inlined extent maps can have an unaligned ram_bytes, and even for that case, the generated extent map will use sectorsize as em->len. - Unaligned hole after inlined extent The kernel never does this by itself, the current btrfs_get_extent() will only return a single inlined extent map that covers the first block. - Incorrect numbers in the comment E.g. 
12291 no matter if you add or dec 1, is not aligned to 4K. The proper number for 12K is 12288, I don't know why there is even a diff of 3, and this completely doesn't match the extent map we inserted later. - Hard-to-modify sequence in setup_file_extents() If some unfortunate person, just like me, needs to modify setup_file_extents(), good luck not screwing up the file offset. Fix them by: - Remove invalid unaligned extent maps This mostly means remove the [4K, 4K + 4) hole case. The remaining ones are already properly aligned. This slightly changes the on-disk data extent allocation, with that removed, the regular extents at [4K, 8K) and [8K , 12K) can be merged. So also add a 4K gap between those two data extents to prevent em merge. - Remove the implied hole after an inlined extent Just like what the kernel is doing for inlined extents in the real world. - Update the comment using proper numbers with 'K' suffixes Since there is no unaligned range except the first inlined one, we can always use numbers with 'K' suffixes, which is way easier to read, and will always be aligned to 1024 at least. - Add comments in setup_file_extents() So that we're clear about the file offset for each test file extent. 
Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/inode-tests.c | 96 ++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 6bd17d059ae6..b04fbcaf0a1d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -81,17 +81,20 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [inline][hole but no extent][ hole ][ regular ][regular1 split] + * The numbers are using 4K fs block size as an example, the real test will scale + * all the extent maps (except the inlined one) according to the block size. * - * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] - * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] + * [ 0 - 6 ][ 6 - 4K ][ 4K - 8K ][ 8K - 12K ] + * [ inline ][ implied hole ][ regular ][ regular1 split ] * - * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] - * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] + * [ 12K - 16K ][ 16K - 24K ][ 24K - 28K ][ 28K - 32K ][ 32K - 36K ] + * [ hole ][ regular1 split ][ prealloc ][ prealloc1 ][ prealloc1 written ] * - * [69635-73731][ 73731 - 86019 ][86019-90115] - * [ regular ][ hole but no extent][ regular ] + * [ 36K - 44K ][ 44K - 52K ][ 52K - 56K ][ 56K - 60K ][ 60K - 68 K ] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1 ] + * + * [ 68K - 72K ][ 72K - 84K ][ 84K - 88K ] + * [ regular ][ hole but no extent ][ regular ] */ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { @@ -100,6 +103,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 
offset = 0; /* + * Start 0, length 6, inlined. + * * Tree-checker has strict limits on inline extents that they can only * exist at file offset 0, thus we can only have one inline file extent * at most. @@ -109,20 +114,18 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) slot++; offset = sectorsize; - /* Now another hole */ - insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); + /* Start 1 * blocksize, length 1 * blocksize, regular. */ + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4; - /* Now for a regular extent */ - insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, - disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot); - slot++; - disk_bytenr += sectorsize; - offset += sectorsize - 1; + /* We don't want the regular em merged with the next one. */ + disk_bytenr += 2 * sectorsize; + offset += sectorsize; /* + * Start 2 * blocksize, length 1 * blocksize, regular. + * * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ @@ -130,10 +133,14 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 3 * blocksize, length 1 * blocksize, regular, explicit hole. */ insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 4 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); @@ -141,7 +148,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 4 * sectorsize; - /* Now for a unwritten prealloc extent */ + /* Start 6 * blocksize, length 1 * blocksize, preallocated. 
*/ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; @@ -154,6 +161,8 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) disk_bytenr += 2 * sectorsize; /* + * Start 7 * blocksize, length 1 * blocksize, prealloc. + * * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. @@ -162,11 +171,15 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; offset += sectorsize; + + /* Start 8 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 9 * blocksize, length 2 * blocksize, prealloc. */ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); @@ -174,7 +187,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 4 * sectorsize; - /* Now a normal compressed extent */ + /* Start 11 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); @@ -183,17 +196,21 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) /* No merges */ disk_bytenr += 2 * sectorsize; - /* Now a split compressed extent */ + /* Start 13 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; offset += sectorsize; + + /* Start 14 * blocksize, length 1 * blocksize, regular. 
*/ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += sectorsize; + + /* Start 15 * blocksize, length 2 * blocksize, regular. */ insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); @@ -201,12 +218,19 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) offset += 2 * sectorsize; disk_bytenr += 2 * sectorsize; - /* Now extents that have a hole but no hole extent */ + /* Start 17 * blocksize, length 1 * blocksize, regular. */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; offset += 4 * sectorsize; disk_bytenr += sectorsize; + + /* + * Start 18 * blocksize, length 3 * blocksize, implied hole (aka no + * file extent item). + * + * Start 21 * blocksize, length 1 * blocksize, regular. + */ insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } @@ -316,28 +340,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = btrfs_extent_map_end(em); btrfs_free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } - if (em->disk_bytenr != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->disk_bytenr); - goto out; - } - if (em->start != offset || em->len != 4) { - test_err( - "unexpected extent wanted start %llu len 4, got start %llu len %llu", - offset, em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %u", em->flags); - goto out; - } - offset = btrfs_extent_map_end(em); - btrfs_free_extent_map(em); - /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -348,10 +350,10 @@ 
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a real extent, got %llu", em->disk_bytenr); goto out; } - if (em->start != offset || em->len != sectorsize - 1) { + if (em->start != offset || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 4095, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start %llu len %u, got start %llu len %llu", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { From 385c65f8274b8caff75102474c4c9e8a95e6ec4c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 22 Jan 2026 09:37:59 +1030 Subject: [PATCH 086/137] btrfs: tests: prepare extent map tests for strict alignment checks Currently the extent map self tests have the following points that will cause false alerts for the incoming strict extent map alignment checks: - Incorrect inlined extent map size Which is not following what the kernel is doing for inlined extents, as btrfs_extent_item_to_extent_map() always uses the fs block size as the length, not the ram_bytes. Fix it by using SZ_4K as extent map's length. - Incorrect btrfs_fs_info::sectorsize As we always use PAGE_SIZE, which can be values larger than 4K. Meanwhile all the immediate numbers used are based on 4K fs block size in the test case. Fix it by using fixed SZ_4K fs block size when allocating the dummy btrfs_fs_info. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index aabf825e8d7b..811f36d41101 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -173,9 +173,12 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) return -ENOMEM; } - /* Add [0, 1K) */ + /* + * Add [0, 1K) which is inlined. 
And the extent map length must + * be one block. + */ em->start = 0; - em->len = SZ_1K; + em->len = SZ_4K; em->disk_bytenr = EXTENT_MAP_INLINE; em->disk_num_bytes = 0; em->ram_bytes = SZ_1K; @@ -219,7 +222,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Add [0, 1K) */ em->start = 0; - em->len = SZ_1K; + em->len = SZ_4K; em->disk_bytenr = EXTENT_MAP_INLINE; em->disk_num_bytes = 0; em->ram_bytes = SZ_1K; @@ -235,7 +238,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_4K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", @@ -1131,8 +1134,11 @@ int btrfs_test_extent_map(void) /* * Note: the fs_info is not set up completely, we only need * fs_info::fsid for the tracepoint. + * + * And all the immediate numbers are based on 4K blocksize, + * thus we have to use 4K as sectorsize no matter the page size. */ - fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + fs_info = btrfs_alloc_dummy_fs_info(SZ_4K, SZ_4K); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; From 71e545d4e33f97258bf7416c132b10a6c1234255 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 22 Jan 2026 09:38:00 +1030 Subject: [PATCH 087/137] btrfs: add strict extent map alignment checks Currently we do not check the alignment of extent_map structure. The reasons are the inode and extent-map tests use unaligned values for start offsets and lengths. Thankfully those legacy problems are properly addressed by previous patches, now we can finally put the alignment checks into validate_extent_map(). 
Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7e38c23a0c1c..095a561d733f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -319,8 +319,15 @@ static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, /* Internal sanity checks for btrfs debug builds. */ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) { + const u32 blocksize = fs_info->sectorsize; + if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) return; + + if (!IS_ALIGNED(em->start, blocksize) || + !IS_ALIGNED(em->len, blocksize)) + dump_extent_map(fs_info, "unaligned start offset or length members", em); + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { if (em->disk_num_bytes == 0) dump_extent_map(fs_info, "zero disk_num_bytes", em); @@ -334,6 +341,11 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", em); + if (!IS_ALIGNED(em->disk_bytenr, blocksize) || + !IS_ALIGNED(em->disk_num_bytes, blocksize) || + !IS_ALIGNED(em->offset, blocksize) || + !IS_ALIGNED(em->ram_bytes, blocksize)) + dump_extent_map(fs_info, "unaligned members", em); } else if (em->offset) { dump_extent_map(fs_info, "non-zero offset for hole/inline", em); } From c8bafc8d6a78bdc484749816668d6f415d008fe8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Jan 2026 18:17:40 +0100 Subject: [PATCH 088/137] btrfs: embed delayed root to struct btrfs_fs_info The fs_info::delayed_root is allocated dynamically but there's only one instance per filesystem so we can embed it into the fs_info itself. The two objects have the same lifetime and delayed roots are always present so we don't need to allocate it on demand from slab. 
There's still some space left in fs_info until the 4K so there won't be a spill over to next page on release config (size grows from 3880 to 3952). In case we want to shrink fs_info there are still holes to fill or we can separate other non-core or optional structures if needed. Link: https://lore.kernel.org/all/cover.1767979013.git.dsterba@suse.com/ Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 24 ++++++++++++------------ fs/btrfs/delayed-inode.h | 15 --------------- fs/btrfs/disk-io.c | 8 ++------ fs/btrfs/fs.h | 18 ++++++++++++++++-- 4 files changed, 30 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2286bee2c6d3..a752646257df 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -257,7 +257,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( struct list_head *p; struct btrfs_delayed_node *next = NULL; - delayed_root = node->root->fs_info->delayed_root; + delayed_root = &node->root->fs_info->delayed_root; spin_lock(&delayed_root->lock); if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { /* not in the list */ @@ -287,7 +287,7 @@ static void __btrfs_release_delayed_node( if (!delayed_node) return; - delayed_root = delayed_node->root->fs_info->delayed_root; + delayed_root = &delayed_node->root->fs_info->delayed_root; mutex_lock(&delayed_node->mutex); if (delayed_node->count)
*/ lockdep_assert_held(&delayed_node->mutex); - delayed_root = delayed_node->root->fs_info->delayed_root; + delayed_root = &delayed_node->root->fs_info->delayed_root; if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; @@ -988,7 +988,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; - delayed_root = delayed_node->root->fs_info->delayed_root; + delayed_root = &delayed_node->root->fs_info->delayed_root; finish_one_item(delayed_root); } } @@ -1002,7 +1002,7 @@ static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) ASSERT(delayed_node->root); delayed_node->count--; - delayed_root = delayed_node->root->fs_info->delayed_root; + delayed_root = &delayed_node->root->fs_info->delayed_root; finish_one_item(delayed_root); } } @@ -1168,7 +1168,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; - delayed_root = fs_info->delayed_root; + delayed_root = &fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { @@ -1417,7 +1417,7 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_node *node; - node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); + node = btrfs_first_delayed_node(&fs_info->delayed_root, &delayed_node_tracker); if (WARN_ON(node)) { btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); @@ -1440,7 +1440,7 @@ static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; + struct btrfs_delayed_root *delayed_root = 
&fs_info->delayed_root; if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || btrfs_workqueue_normal_congested(fs_info->delayed_workers)) @@ -1970,7 +1970,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; - atomic_inc(&root->fs_info->delayed_root->items); + atomic_inc(&root->fs_info->delayed_root.items); release_node: mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); @@ -2012,7 +2012,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) mutex_lock(&delayed_node->mutex); if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { delayed_node->count++; - atomic_inc(&fs_info->delayed_root->items); + atomic_inc(&fs_info->delayed_root.items); } mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); @@ -2118,7 +2118,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(fs_info->delayed_root, + curr_node = btrfs_first_delayed_node(&fs_info->delayed_root, &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b09d4ec8c77d..fc752863f89b 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -30,21 +30,6 @@ enum btrfs_delayed_item_type { BTRFS_DELAYED_DELETION_ITEM }; -struct btrfs_delayed_root { - spinlock_t lock; - struct list_head node_list; - /* - * Used for delayed nodes which is waiting to be dealt with by the - * worker. If the delayed node is inserted into the work queue, we - * drop it from this list. 
- */ - struct list_head prepare_list; - atomic_t items; /* for delayed items */ - atomic_t items_seq; /* for delayed items */ - int nodes; /* for delayed nodes */ - wait_queue_head_t wait; -}; - struct btrfs_ref_tracker_dir { #ifdef CONFIG_BTRFS_DEBUG struct ref_tracker_dir dir; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 32fffb0557e5..665440ecce12 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -22,6 +22,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" +#include "delayed-inode.h" #include "bio.h" #include "print-tree.h" #include "locking.h" @@ -1217,7 +1218,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); free_global_roots(fs_info); btrfs_put_root(fs_info->tree_root); btrfs_put_root(fs_info->chunk_root); @@ -2942,11 +2942,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_KERNEL); - if (!fs_info->delayed_root) - return -ENOMEM; - btrfs_init_delayed_root(fs_info->delayed_root); + btrfs_init_delayed_root(&fs_info->delayed_root); if (sb_rdonly(sb)) set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 5bbc993a66e1..d315530a2928 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -44,7 +44,6 @@ struct btrfs_block_group; struct btrfs_root; struct btrfs_fs_devices; struct btrfs_transaction; -struct btrfs_delayed_root; struct btrfs_balance_control; struct btrfs_subpage_info; struct btrfs_stripe_hash_table; @@ -464,6 +463,21 @@ struct btrfs_commit_stats { u64 critical_section_start_time; }; +struct btrfs_delayed_root { + spinlock_t lock; + struct list_head node_list; + /* + * Used for delayed nodes which is waiting to be dealt with by the + * worker. 
If the delayed node is inserted into the work queue, we + * drop it from this list. + */ + struct list_head prepare_list; + atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ + int nodes; /* for delayed nodes */ + wait_queue_head_t wait; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -817,7 +831,7 @@ struct btrfs_fs_info { /* Filesystem state */ unsigned long fs_state; - struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_root delayed_root; /* Entries are eb->start >> nodesize_bits */ struct xarray buffer_tree; From 86523d8d2f3ad16e865a0a47e725d6b2cf36fc78 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Jan 2026 18:17:41 +0100 Subject: [PATCH 089/137] btrfs: reorder members in btrfs_delayed_root for better packing There are two unnecessary 4B holes in btrfs_delayed_root; struct btrfs_delayed_root { spinlock_t lock; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head node_list; /* 8 16 */ struct list_head prepare_list; /* 24 16 */ atomic_t items; /* 40 4 */ atomic_t items_seq; /* 44 4 */ int nodes; /* 48 4 */ /* XXX 4 bytes hole, try to pack */ wait_queue_head_t wait; /* 56 24 */ /* size: 80, cachelines: 2, members: 7 */ /* sum members: 72, holes: 2, sum holes: 8 */ /* last cacheline: 16 bytes */ }; Reordering 'nodes' after 'lock' reduces size by 8B, to 72 on release config. 
Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d315530a2928..d3762fbe7267 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -465,6 +465,7 @@ struct btrfs_commit_stats { struct btrfs_delayed_root { spinlock_t lock; + int nodes; /* for delayed nodes */ struct list_head node_list; /* * Used for delayed nodes which is waiting to be dealt with by the @@ -474,7 +475,6 @@ struct btrfs_delayed_root { struct list_head prepare_list; atomic_t items; /* for delayed items */ atomic_t items_seq; /* for delayed items */ - int nodes; /* for delayed nodes */ wait_queue_head_t wait; }; From 2891539a26899f7cc5c281ce7060a1a8b3631a74 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Jan 2026 18:17:42 +0100 Subject: [PATCH 090/137] btrfs: don't use local variables for fs_info->delayed_root In all cases the delayed_root is used once in a function, we don't need to use a local variable for that. Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a752646257df..fc5926ecc762 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -443,7 +443,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; - struct btrfs_delayed_root *delayed_root; /* Not inserted, ignore it. */ if (RB_EMPTY_NODE(&delayed_item->rb_node)) @@ -452,8 +451,6 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) /* If it's in a rbtree, then we need to have delayed node locked. 
*/ lockdep_assert_held(&delayed_node->mutex); - delayed_root = &delayed_node->root->fs_info->delayed_root; - if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else @@ -462,8 +459,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); delayed_node->count--; - - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -980,30 +976,21 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) { - struct btrfs_delayed_root *delayed_root; - if (delayed_node && test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { ASSERT(delayed_node->root); clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count--; - - delayed_root = &delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } } static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) { - if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { - struct btrfs_delayed_root *delayed_root; - ASSERT(delayed_node->root); delayed_node->count--; - - delayed_root = &delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + finish_one_item(&delayed_node->root->fs_info->delayed_root); } } @@ -1150,7 +1137,6 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; struct 
btrfs_path *path; @@ -1168,9 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; - delayed_root = &fs_info->delayed_root; - - curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); + curr_node = btrfs_first_delayed_node(&fs_info->delayed_root, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); From 8ad2f2edc82b8ffde54eab36a677cfb3be2236e1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Jan 2026 18:17:43 +0100 Subject: [PATCH 091/137] btrfs: pass btrfs_fs_info to btrfs_first_delayed_node() As the delayed root is now in the fs_info we can pass it to btrfs_first_delayed_node(). Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc5926ecc762..1739a0b29c49 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -232,19 +232,19 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, } static struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root, + struct btrfs_fs_info *fs_info, struct btrfs_ref_tracker *tracker) { struct btrfs_delayed_node *node; - spin_lock(&delayed_root->lock); - node = list_first_entry_or_null(&delayed_root->node_list, + spin_lock(&fs_info->delayed_root.lock); + node = list_first_entry_or_null(&fs_info->delayed_root.node_list, struct btrfs_delayed_node, n_list); if (node) { refcount_inc(&node->refs); btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); } - spin_unlock(&delayed_root->lock); + spin_unlock(&fs_info->delayed_root.lock); return node; } @@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) block_rsv = 
trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; - curr_node = btrfs_first_delayed_node(&fs_info->delayed_root, &curr_delayed_node_tracker); + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); @@ -1401,7 +1401,7 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) struct btrfs_ref_tracker delayed_node_tracker; struct btrfs_delayed_node *node; - node = btrfs_first_delayed_node(&fs_info->delayed_root, &delayed_node_tracker); + node = btrfs_first_delayed_node(fs_info, &delayed_node_tracker); if (WARN_ON(node)) { btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker); @@ -2102,8 +2102,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; - curr_node = btrfs_first_delayed_node(&fs_info->delayed_root, - &curr_delayed_node_tracker); + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); while (curr_node) { __btrfs_kill_delayed_node(curr_node); From 51b1fcf71c88c3c89e7dcf07869c5de837b1f428 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:35:23 +0000 Subject: [PATCH 092/137] btrfs: qgroup: return correct error when deleting qgroup relation item If we fail to delete the second qgroup relation item, we end up returning success or -ENOENT in case the first item does not exist, instead of returning the error from the second item deletion. 
Fixes: 73798c465b66 ("btrfs: qgroup: Try our best to delete qgroup relations") Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 14d393a5853d..c03bb96d3a34 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1640,8 +1640,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, if (ret < 0 && ret != -ENOENT) goto out; ret2 = del_qgroup_relation_item(trans, dst, src); - if (ret2 < 0 && ret2 != -ENOENT) + if (ret2 < 0 && ret2 != -ENOENT) { + ret = ret2; goto out; + } /* At least one deletion succeeded, return 0 */ if (!ret || !ret2) From 01f93271ed0218937bd6386d6d6deccd92621a38 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 11:25:31 +0000 Subject: [PATCH 093/137] btrfs: remove pointless out labels from ioctl.c Some functions (__btrfs_ioctl_snap_create(), btrfs_ioctl_subvol_setflags() and copy_to_sk()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting up the 'ret' variable. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d9e7dd317670..f1b56be6f8f4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1176,7 +1176,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, bool readonly, struct btrfs_qgroup_inherit *inherit) { - int ret = 0; + int ret; struct qstr qname = QSTR_INIT(name, strlen(name)); if (!S_ISDIR(file_inode(file)->i_mode)) @@ -1184,7 +1184,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = mnt_want_write_file(file); if (ret) - goto out; + return ret; if (strchr(name, '/')) { ret = -EINVAL; @@ -1236,7 +1236,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } out_drop_write: mnt_drop_write_file(file); -out: return ret; } @@ -1352,14 +1351,14 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, struct btrfs_trans_handle *trans; u64 root_flags; u64 flags; - int ret = 0; + int ret; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret) - goto out; + return ret; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; @@ -1428,7 +1427,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, up_write(&fs_info->subvol_sem); out_drop_write: mnt_drop_write_file(file); -out: return ret; } @@ -1494,10 +1492,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, continue; if (sizeof(sh) + item_len > *buf_size) { - if (*num_found) { - ret = 1; - goto out; - } + if (*num_found) + return 1; /* * return one empty item back for v1, which does not @@ -1509,10 +1505,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, ret = -EOVERFLOW; } - if (sizeof(sh) + item_len + *sk_offset > *buf_size) { - ret = 1; - goto out; - 
} + if (sizeof(sh) + item_len + *sk_offset > *buf_size) + return 1; sh.objectid = key->objectid; sh.type = key->type; @@ -1526,10 +1520,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, * problem. Otherwise we'll fault and then copy the buffer in * properly this next time through */ - if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = 0; - goto out; - } + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) + return 0; *sk_offset += sizeof(sh); @@ -1541,22 +1533,20 @@ static noinline int copy_to_sk(struct btrfs_path *path, */ if (read_extent_buffer_to_user_nofault(leaf, up, item_off, item_len)) { - ret = 0; *sk_offset -= sizeof(sh); - goto out; + return 0; } *sk_offset += item_len; } (*num_found)++; - if (ret) /* -EOVERFLOW from above */ - goto out; + /* -EOVERFLOW from above. */ + if (ret) + return ret; - if (*num_found >= sk->nr_items) { - ret = 1; - goto out; - } + if (*num_found >= sk->nr_items) + return 1; } advance_key: ret = 0; @@ -1576,7 +1566,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, key->objectid++; } else ret = 1; -out: + /* * 0: all items from this leaf copied, continue with next * 1: * more items can be copied, but unused buffer is too small From ccba88cb6ac0f936c0adcaf4c1a213086c7f3457 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 11:29:02 +0000 Subject: [PATCH 094/137] btrfs: remove pointless out labels from send.c Some functions (process_extent(), process_recorded_refs_if_needed(), changed_inode(), compare_refs() and changed_cb()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 76 +++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d8127a7120c2..3dcfdba018b5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6449,11 +6449,9 @@ static int process_extent(struct send_ctx *sctx, if (sctx->parent_root && !sctx->cur_inode_new) { ret = is_extent_unchanged(sctx, path, key); if (ret < 0) - goto out; - if (ret) { - ret = 0; + return ret; + if (ret) goto out_hole; - } } else { struct btrfs_file_extent_item *ei; u8 type; @@ -6469,31 +6467,25 @@ static int process_extent(struct send_ctx *sctx, * we have enough commands queued up to justify rev'ing * the send spec. */ - if (type == BTRFS_FILE_EXTENT_PREALLOC) { - ret = 0; - goto out; - } + if (type == BTRFS_FILE_EXTENT_PREALLOC) + return 0; /* Have a hole, just skip it. */ - if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { - ret = 0; - goto out; - } + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) + return 0; } } ret = find_extent_clone(sctx, path, key->objectid, key->offset, sctx->cur_inode_size, &found_clone); if (ret != -ENOENT && ret < 0) - goto out; + return ret; ret = send_write_or_clone(sctx, path, key, found_clone); if (ret) - goto out; + return ret; out_hole: - ret = maybe_send_hole(sctx, path, key); -out: - return ret; + return maybe_send_hole(sctx, path, key); } static int process_all_extents(struct send_ctx *sctx) @@ -6535,23 +6527,24 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end, int *pending_move, int *refs_processed) { - int ret = 0; + int ret; if (sctx->cur_ino == 0) - goto out; + return 0; + if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) - goto out; + return 0; + if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 
- goto out; + return 0; ret = process_recorded_refs(sctx, pending_move); if (ret < 0) - goto out; + return ret; *refs_processed = 1; -out: - return ret; + return 0; } static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) @@ -6768,7 +6761,7 @@ static void close_current_inode(struct send_ctx *sctx) static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; + int ret; struct btrfs_key *key = sctx->cmp_key; struct btrfs_inode_item *left_ii = NULL; struct btrfs_inode_item *right_ii = NULL; @@ -6860,7 +6853,7 @@ static int changed_inode(struct send_ctx *sctx, if (result == BTRFS_COMPARE_TREE_NEW) { if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; - goto out; + return 0; } sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; @@ -6888,7 +6881,7 @@ static int changed_inode(struct send_ctx *sctx, old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); if (new_nlinks == 0 && old_nlinks == 0) { sctx->ignore_cur_inode = true; - goto out; + return 0; } else if (new_nlinks == 0 || old_nlinks == 0) { sctx->cur_inode_new_gen = 1; } @@ -6914,7 +6907,7 @@ static int changed_inode(struct send_ctx *sctx, ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_DELETED); if (ret < 0) - goto out; + return ret; } /* @@ -6935,11 +6928,11 @@ static int changed_inode(struct send_ctx *sctx, left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) - goto out; + return ret; ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); if (ret < 0) - goto out; + return ret; /* * Advance send_progress now as we did not get * into process_recorded_refs_if_needed in the @@ -6953,10 +6946,10 @@ static int changed_inode(struct send_ctx *sctx, */ ret = process_all_extents(sctx); if (ret < 0) - goto out; + return ret; ret = process_all_new_xattrs(sctx); if (ret < 0) - goto out; + return ret; } } else { sctx->cur_inode_gen = left_gen; @@ -6970,8 +6963,7 @@ static int 
changed_inode(struct send_ctx *sctx, } } -out: - return ret; + return 0; } /* @@ -7104,20 +7096,20 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, u32 item_size; u32 cur_offset = 0; int ref_name_len; - int ret = 0; /* Easy case, just check this one dirid */ if (key->type == BTRFS_INODE_REF_KEY) { dirid = key->offset; - ret = dir_changed(sctx, dirid); - goto out; + return dir_changed(sctx, dirid); } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { + int ret; + extref = (struct btrfs_inode_extref *)(ptr + cur_offset); dirid = btrfs_inode_extref_parent(leaf, extref); @@ -7127,11 +7119,10 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, continue; ret = dir_changed(sctx, dirid); if (ret) - break; + return ret; last_dirid = dirid; } -out: - return ret; + return 0; } /* @@ -7212,12 +7203,12 @@ static int changed_cb(struct btrfs_path *left_path, ret = finish_inode_if_needed(sctx, 0); if (ret < 0) - goto out; + return ret; /* Ignore non-FS objects */ if (key->objectid == BTRFS_FREE_INO_OBJECTID || key->objectid == BTRFS_FREE_SPACE_OBJECTID) - goto out; + return 0; if (key->type == BTRFS_INODE_ITEM_KEY) { ret = changed_inode(sctx, result); @@ -7234,7 +7225,6 @@ static int changed_cb(struct btrfs_path *left_path, ret = changed_verity(sctx, result); } -out: return ret; } From b3acb158ea1a2c9deb1bbff8360001a6a179dc9b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:48:33 +0000 Subject: [PATCH 095/137] btrfs: remove pointless out labels from qgroup.c Some functions (__del_qgroup_relation() and qgroup_trace_new_subtree_blocks()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c03bb96d3a34..f53c313ab6e4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1613,10 +1613,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, int ret = 0; int ret2; - if (!fs_info->quota_root) { - ret = -ENOTCONN; - goto out; - } + if (!fs_info->quota_root) + return -ENOTCONN; member = find_qgroup_rb(fs_info, src); parent = find_qgroup_rb(fs_info, dst); @@ -1638,12 +1636,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, delete_item: ret = del_qgroup_relation_item(trans, src, dst); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; ret2 = del_qgroup_relation_item(trans, dst, src); - if (ret2 < 0 && ret2 != -ENOENT) { - ret = ret2; - goto out; - } + if (ret2 < 0 && ret2 != -ENOENT) + return ret2; /* At least one deletion succeeded, return 0 */ if (!ret || !ret2) @@ -1657,7 +1653,7 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, squota_check_parent_usage(fs_info, parent); spin_unlock(&fs_info->qgroup_lock); } -out: + return ret; } @@ -2490,13 +2486,11 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, /* This node is old, no need to trace */ if (child_gen < last_snapshot) - goto out; + return ret; eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); dst_path->nodes[cur_level] = eb; dst_path->slots[cur_level] = 0; @@ -2541,7 +2535,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; dst_path->locks[cur_level] = 0; } -out: + return ret; } From 3ca4f9d0963e80435c7b69e2a8fd2b683085a3e6 Mon Sep 17 00:00:00 2001 From: 
Filipe Manana Date: Tue, 20 Jan 2026 19:50:03 +0000 Subject: [PATCH 096/137] btrfs: remove pointless out labels from disk-io.c Some functions (btrfs_validate_extent_buffer() and btrfs_start_pre_rw_mount()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 54 +++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 665440ecce12..20c405a4789d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -370,22 +370,19 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, btrfs_err_rl(fs_info, "bad tree block start, mirror %u want %llu have %llu", eb->read_mirror, eb->start, found_start); - ret = -EIO; - goto out; + return -EIO; } if (unlikely(check_tree_block_fsid(eb))) { btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", eb->start, eb->read_mirror); - ret = -EIO; - goto out; + return -EIO; } found_level = btrfs_header_level(eb); if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "bad tree block level, mirror %u level %d on logical %llu", eb->read_mirror, btrfs_header_level(eb), eb->start); - ret = -EIO; - goto out; + return -EIO; } csum_tree_block(eb, result); @@ -400,18 +397,15 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? 
", ignored" : ""); - if (unlikely(!ignore_csum)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(!ignore_csum)) + return -EUCLEAN; } if (unlikely(found_level != check->level)) { btrfs_err(fs_info, "level verify failed on logical %llu mirror %u wanted %u found %u", eb->start, eb->read_mirror, check->level, found_level); - ret = -EIO; - goto out; + return -EIO; } if (unlikely(check->transid && btrfs_header_generation(eb) != check->transid)) { @@ -419,8 +413,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, check->transid, btrfs_header_generation(eb)); - ret = -EIO; - goto out; + return -EIO; } if (check->has_first_key) { const struct btrfs_key *expect_key = &check->first_key; @@ -438,14 +431,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, expect_key->type, expect_key->offset, found_key.objectid, found_key.type, found_key.offset); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } if (check->owner_root) { ret = btrfs_check_eb_owner(eb, check->owner_root); if (ret < 0) - goto out; + return ret; } /* If this is a leaf block and it is corrupt, just return -EIO. 
*/ @@ -459,7 +451,6 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); -out: return ret; } @@ -3071,7 +3062,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to rebuild free space tree: %d", ret); - goto out; + return ret; } } @@ -3082,7 +3073,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to disable free space tree: %d", ret); - goto out; + return ret; } } @@ -3093,7 +3084,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) ret = btrfs_delete_orphan_free_space_entries(fs_info); if (ret < 0) { btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret); - goto out; + return ret; } /* * btrfs_find_orphan_roots() is responsible for finding all the dead @@ -3108,17 +3099,17 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) */ ret = btrfs_find_orphan_roots(fs_info); if (ret) - goto out; + return ret; ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto out; + return ret; down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); - goto out; + return ret; } up_read(&fs_info->cleanup_work_sem); @@ -3127,7 +3118,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); - goto out; + return ret; } if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && @@ -3137,24 +3128,24 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to create free space tree: %d", ret); - goto out; + return ret; } } if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { ret = btrfs_set_free_space_cache_v1_active(fs_info, 
cache_opt); if (ret) - goto out; + return ret; } ret = btrfs_resume_balance_async(fs_info); if (ret) - goto out; + return ret; ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { btrfs_warn(fs_info, "failed to resume dev_replace"); - goto out; + return ret; } btrfs_qgroup_rescan_resume(fs_info); @@ -3165,12 +3156,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) if (ret) { btrfs_warn(fs_info, "failed to create the UUID tree %d", ret); - goto out; + return ret; } } -out: - return ret; + return 0; } /* From ea8f9210050136bdd14f5e32b04cd01c8bd5c0ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:52:10 +0000 Subject: [PATCH 097/137] btrfs: remove pointless out labels from extent-tree.c Some functions (lookup_extent_data_ref(), __btrfs_mod_ref() and btrfs_free_tree_block()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bd4d134a3380..a91bce05ffb4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -477,7 +477,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; + return ret; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); @@ -488,12 +488,11 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); goto again; } - ret = 0; - break; + return 0; } path->slots[0]++; } -fail: + return ret; } @@ -2501,7 +2500,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int 
i; int action; int level; - int ret = 0; + int ret; if (btrfs_is_testing(fs_info)) return 0; @@ -2553,7 +2552,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, else ret = btrfs_free_extent(trans, &ref); if (ret) - goto fail; + return ret; } else { /* We don't know the owning_root, leave as 0. */ ref.bytenr = btrfs_node_blockptr(buf, i); @@ -2566,12 +2565,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, else ret = btrfs_free_extent(trans, &ref); if (ret) - goto fail; + return ret; } } return 0; -fail: - return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3575,12 +3572,12 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, return 0; if (btrfs_header_generation(buf) != trans->transid) - goto out; + return 0; if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) - goto out; + return 0; } bg = btrfs_lookup_block_group(fs_info, buf->start); @@ -3588,7 +3585,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); - goto out; + return 0; } /* @@ -3612,7 +3609,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, || btrfs_is_zoned(fs_info)) { pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); - goto out; + return 0; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); @@ -3622,7 +3619,6 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); -out: return 0; } From 46099eaef3716ea31557c5312ee69460b0c57c0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:54:00 +0000 Subject: [PATCH 098/137] btrfs: remove pointless out labels from free-space-cache.c Some functions (update_cache_item(), find_free_space(), trim_bitmaps(), 
btrfs_remove_free_space() and cleanup_free_space_cache_v1()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6fb813cc224b..cc075a460a22 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1162,7 +1162,7 @@ update_cache_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); - goto fail; + return ret; } leaf = path->nodes[0]; if (ret > 0) { @@ -1176,7 +1176,7 @@ update_cache_item(struct btrfs_trans_handle *trans, inode->i_size - 1, EXTENT_DELALLOC, NULL); btrfs_release_path(path); - goto fail; + return -ENOENT; } } @@ -1189,9 +1189,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; - -fail: - return -1; } static noinline_for_stack int write_pinned_extent_entries( @@ -2017,7 +2014,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, int ret; if (!ctl->free_space_offset.rb_node) - goto out; + return NULL; again: if (use_bytes_index) { node = rb_first_cached(&ctl->free_space_bytes); @@ -2025,7 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) - goto out; + return NULL; node = &entry->offset_index; } @@ -2109,7 +2106,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *bytes = entry->bytes - align_off; return entry; } -out: + return NULL; } @@ -2894,7 +2891,7 @@ int btrfs_remove_free_space(struct btrfs_block_group 
*block_group, old_end - (offset + bytes), info->trim_state); WARN_ON(ret); - goto out; + return ret; } } @@ -2906,7 +2903,7 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, out_lock: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); -out: + return ret; } @@ -4006,7 +4003,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, if (async && *total_trimmed) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); - goto out; + return ret; } bytes = min(bytes, end - start); @@ -4067,7 +4064,6 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, if (offset >= end) block_group->discard_cursor = end; -out: return ret; } @@ -4160,20 +4156,20 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { + int ret; + block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); if (ret) - goto out; + return ret; node = rb_next(node); } -out: - return ret; + return 0; } int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) From 47c9dbc791e0bfb4ed6b0ce866f9fc848db39e6c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:55:54 +0000 Subject: [PATCH 099/137] btrfs: remove pointless out labels from inode.c Some functions (insert_inline_extent() and insert_reserved_file_extent()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6637e451992f..10609b8199a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -507,7 +507,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) - goto fail; + return ret; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -546,7 +546,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ret = btrfs_inode_set_file_extent_range(inode, 0, ALIGN(size, root->fs_info->sectorsize)); if (ret) - goto fail; + return ret; /* * We're an inline extent, so nobody can extend the file past i_size @@ -562,8 +562,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } inode->disk_i_size = i_size; -fail: - return ret; + return 0; } static bool can_cow_file_range_inline(struct btrfs_inode *inode, @@ -3037,7 +3036,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) - goto out; + return ret; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); @@ -3047,7 +3046,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) - goto out; + return ret; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); @@ -3082,13 +3081,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) - goto out; + return ret; - ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos - offset, - 
qgroup_reserved, &ins); -out: - return ret; + return btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), + file_pos - offset, + qgroup_reserved, &ins); } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, From 1038614e8f2c045561db0cf2b064a0e5cb909a1c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:58:06 +0000 Subject: [PATCH 100/137] btrfs: remove pointless out labels from uuid-tree.c Some functions (btrfs_uuid_iter_rem() and btrfs_check_uuid_tree_entry()) have an 'out' label that does nothing but return, making it pointless. Simplify this by removing the label and returning instead of gotos plus setting the 'ret' variable. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index e3a1310fa7d5..f24c14b9bb2f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -207,15 +207,11 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, /* 1 - for the uuid item */ trans = btrfs_start_transaction(uuid_root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); btrfs_end_transaction(trans); - -out: return ret; } @@ -235,14 +231,14 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, if (type != BTRFS_UUID_KEY_SUBVOL && type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; + return 0; subvol_root = btrfs_get_fs_root(fs_info, subvolid, true); if (IS_ERR(subvol_root)) { ret = PTR_ERR(subvol_root); if (ret == -ENOENT) - ret = 1; - goto out; + return 1; + return ret; } switch (type) { @@ -257,7 +253,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, break; } btrfs_put_root(subvol_root); -out: + return ret; } From 
2efcd25a7638dd217e2fea01107416e24a2d1935 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 19:59:13 +0000 Subject: [PATCH 101/137] btrfs: remove out label in load_extent_tree_free() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 28aa87872f0d..3a0521236ecd 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -766,7 +766,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -797,7 +797,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto out; + return ret; if (ret) break; leaf = path->nodes[0]; @@ -828,7 +828,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) ret = btrfs_add_new_free_space(block_group, last, key.objectid, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + @@ -847,9 +847,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->slots[0]++; } - ret = btrfs_add_new_free_space(block_group, last, block_group_end, NULL); -out: - return ret; + return btrfs_add_new_free_space(block_group, last, block_group_end, NULL); } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) From bb09b9a4917cb5f040dbce66c236c9adae2eeaea Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 
20:01:31 +0000 Subject: [PATCH 102/137] btrfs: remove out_failed label in find_lock_delalloc_range() There is no point in having the label since all it does is return the value in the 'found' variable. Instead make every goto return directly and remove the label. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dfc17c292217..3df399dc8856 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -440,8 +440,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, loops = 1; goto again; } else { - found = false; - goto out_failed; + return false; } } @@ -461,7 +460,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, } *start = delalloc_start; *end = delalloc_end; -out_failed: + return found; } From cc27540dd09571938cf8e9c80a311b403ac073c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:02:51 +0000 Subject: [PATCH 103/137] btrfs: remove out label in btrfs_csum_file_blocks() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 568f0e0ebdf6..7bd715442f3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1134,7 +1134,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, } ret = PTR_ERR(item); if (ret != -EFBIG && ret != -ENOENT) - goto out; + return ret; if (ret == -EFBIG) { u32 item_size; @@ -1150,7 +1150,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, /* We didn't find a csum item, insert one. */ ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; found_next = 1; goto insert; } @@ -1178,7 +1178,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, csum_size, 1); path->search_for_extension = false; if (ret < 0) - goto out; + return ret; if (ret > 0) { if (path->slots[0] == 0) @@ -1234,14 +1234,14 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; found_next = 1; goto insert; } ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) - goto out; + return ret; tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; if (tmp <= INT_MAX) @@ -1282,7 +1282,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -1307,8 +1307,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, cond_resched(); goto again; } -out: - return ret; + + return 0; } void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, From 
610ff1c9df5499aed83d6a1ca2e9e9a2aefecc13 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:03:45 +0000 Subject: [PATCH 104/137] btrfs: remove out label in btrfs_mark_extent_written() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1759776d2d57..56ece1109832 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -565,7 +565,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret = 0; + int ret; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -580,7 +580,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -589,20 +589,20 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (unlikely(key.offset > start || extent_end < end)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -632,7 +632,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, trans->transid); 
btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - goto out; + return 0; } } @@ -660,7 +660,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - goto out; + return 0; } } @@ -676,7 +676,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -704,7 +704,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (split == start) { @@ -713,7 +713,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, if (unlikely(start != key.offset)) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } path->slots[0]--; extent_end = end; @@ -744,7 +744,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } other_start = 0; @@ -762,7 +762,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, &ref); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } if (del_nr == 0) { @@ -783,11 +783,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } -out: - return ret; + + return 0; } /* From 55807025a63fc727527cea790792e08c08e2e3c4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:04:26 +0000 Subject: [PATCH 105/137] btrfs: remove out label in lzo_decompress() There is no point in having the label since all it does is return the value in the 'ret' variable. 
Instead make every goto return directly and remove the label. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 4024ce416971..bd5ee82080fa 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -484,7 +484,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, size_t in_len; size_t out_len; size_t max_segment_len = workspace_buf_length(fs_info); - int ret = 0; + int ret; if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; @@ -495,10 +495,8 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, data_in += LZO_LEN; in_len = read_compress_length(data_in); - if (unlikely(in_len != srclen - LZO_LEN * 2)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(in_len != srclen - LZO_LEN * 2)) + return -EUCLEAN; data_in += LZO_LEN; out_len = sectorsize; @@ -510,19 +508,18 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(dest_folio)); - ret = -EIO; - goto out; + return -EIO; } ASSERT(out_len <= sectorsize); memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { - ret = -EIO; folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); + return -EIO; } -out: - return ret; + + return 0; } const struct btrfs_compress_levels btrfs_lzo_compress = { From 6329592ca65222183aaebb377b8a828ecf85e55f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:05:02 +0000 Subject: [PATCH 106/137] btrfs: remove out label in scrub_find_fill_first_stripe() There is no point in having the label since all it does is return the value in the 'ret' variable. 
Instead make every goto return directly and remove the label. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0bd4aebe1687..2a64e2d50ced 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1696,7 +1696,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, logical_len); /* Either error or not found. */ if (ret) - goto out; + return ret; get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) @@ -1729,7 +1729,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) - goto out; + return ret; if (ret > 0) { ret = 0; break; @@ -1763,7 +1763,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, stripe->logical, stripe_end, stripe->csums, &csum_bitmap); if (ret < 0) - goto out; + return ret; if (ret > 0) ret = 0; @@ -1773,7 +1773,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); -out: + return ret; } From 61fb7f04ee06e6c7e113a490af1a057f958f1d05 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:05:43 +0000 Subject: [PATCH 107/137] btrfs: remove out label in finish_verity() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/verity.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index a2ac3fb68bc8..06cbd6f00a78 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -525,23 +525,21 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, (const char *)&item, sizeof(item)); if (ret) - goto out; + return ret; /* Write out the descriptor itself */ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, desc, desc_size); if (ret) - goto out; + return ret; /* * 1 for updating the inode flag * 1 for deleting the orphan */ trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); @@ -554,8 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, btrfs_set_fs_compat_ro(root->fs_info, VERITY); end_trans: btrfs_end_transaction(trans); -out: - return ret; + return 0; } From cefef3cc128076813a7ba5cf34f80130dce3f0a2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:06:20 +0000 Subject: [PATCH 108/137] btrfs: remove out label in btrfs_check_rw_degradable() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index af0197b242a7..cff2412bc879 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7576,10 +7576,9 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? Return false anyway */ - if (!map) { - ret = false; - goto out; - } + if (!map) + return false; + while (map) { int missing = 0; int max_tolerated; @@ -7604,15 +7603,14 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", map->start, missing, max_tolerated); btrfs_free_chunk_map(map); - ret = false; - goto out; + return false; } next_start = map->start + map->chunk_len; btrfs_free_chunk_map(map); map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } -out: + return ret; } From 5eb01bf4a9407e8d825ac9ee5b5a1ef2c1972e61 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:06:57 +0000 Subject: [PATCH 109/137] btrfs: remove out label in btrfs_init_space_info() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bc493243f777..bb5aac7ee9d2 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -329,7 +329,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) struct btrfs_super_block *disk_super; u64 features; u64 flags; - int mixed = 0; + bool mixed = false; int ret; disk_super = fs_info->super_copy; @@ -338,28 +338,28 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; + mixed = true; flags = BTRFS_BLOCK_GROUP_SYSTEM; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; } else { flags = BTRFS_BLOCK_GROUP_METADATA; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; flags = BTRFS_BLOCK_GROUP_DATA; ret = create_space_info(fs_info, flags); if (ret) - goto out; + return ret; } if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { @@ -367,7 +367,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) ret = create_space_info(fs_info, flags); } -out: return ret; } From 3f8982543dae28159cec5fad33c1b3f5cd12314b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jan 2026 20:07:32 +0000 Subject: [PATCH 110/137] btrfs: remove out label in btrfs_wait_for_commit() There is no point in having the label since all it does is return the value in the 'ret' variable. Instead make every goto return directly and remove the label. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f4cc9e1a1b93..8aa55cd8a0bf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -950,7 +950,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (transid) { if (transid <= btrfs_get_last_trans_committed(fs_info)) - goto out; + return 0; /* find specified transaction */ spin_lock(&fs_info->trans_lock); @@ -975,7 +975,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (!cur_trans) { if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; - goto out; + return ret; } } else { /* find newest transaction that is committing | committed */ @@ -991,14 +991,15 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) } } spin_unlock(&fs_info->trans_lock); + /* Nothing committing or committed. */ if (!cur_trans) - goto out; /* nothing committing|committed */ + return ret; } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); -out: + return ret; } From dda3ec9ee6b3e120603bff1b798f25b51e54ac5d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 17 Dec 2025 20:14:04 +0900 Subject: [PATCH 111/137] btrfs: zoned: fixup last alloc pointer after extent removal for RAID1 When a block group is composed of a sequential write zone and a conventional zone, we recover the (pseudo) write pointer of the conventional zone using the end of the last allocated position. However, if the last extent in a block group is removed, the last extent position will be smaller than the other real write pointer position. Then, that will cause an error due to mismatch of the write pointers. We can fixup this case by moving the alloc_offset to the corresponding write pointer position. 
Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d6a2480d5dc1..714f45045c84 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1491,6 +1491,21 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, /* In case a device is missing we have a cap of 0, so don't use it. */ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. + */ + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + if (last_alloc <= zone_info[i].alloc_offset) { + last_alloc = zone_info[i].alloc_offset; + break; + } + } + for (i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; From e2d848649e64de39fc1b9c64002629b4daa1105d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 23 Jan 2026 21:41:35 +0900 Subject: [PATCH 112/137] btrfs: zoned: fixup last alloc pointer after extent removal for DUP When a block group is composed of a sequential write zone and a conventional zone, we recover the (pseudo) write pointer of the conventional zone using the end of the last allocated position. However, if the last extent in a block group is removed, the last extent position will be smaller than the other real write pointer position. Then, that will cause an error due to mismatch of the write pointers. We can fixup this case by moving the alloc_offset to the corresponding write pointer position. 
Fixes: c0d90a79e8e6 ("btrfs: zoned: fix alloc_offset calculation for partly conventional block groups") CC: stable@vger.kernel.org # 6.16+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 714f45045c84..a10e1076c881 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1450,6 +1450,20 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, return -EIO; } + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. + */ + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + if (last_alloc <= zone_info[i].alloc_offset) { + last_alloc = zone_info[i].alloc_offset; + break; + } + } + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) zone_info[0].alloc_offset = last_alloc; From 52ee9965d09b2c56a027613db30d1fb20d623861 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 23 Jan 2026 21:41:36 +0900 Subject: [PATCH 113/137] btrfs: zoned: fixup last alloc pointer after extent removal for RAID0/10 When a block group is composed of a sequential write zone and a conventional zone, we recover the (pseudo) write pointer of the conventional zone using the end of the last allocated position. However, if the last extent in a block group is removed, the last extent position will be smaller than the other real write pointer position. Then, that will cause an error due to mismatch of the write pointers. We can fixup this case by moving the alloc_offset to the corresponding write pointer position. 
Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 194 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 179 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a10e1076c881..7fa60a44d716 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1561,7 +1561,9 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, { struct btrfs_fs_info *fs_info = bg->fs_info; u64 stripe_nr = 0, stripe_offset = 0; + u64 prev_offset = 0; u32 stripe_index = 0; + bool has_partial = false, has_conventional = false; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1569,6 +1571,35 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. + */ + for (int i = 0; i < map->num_stripes; i++) { + u64 alloc; + + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + stripe_nr = zone_info[i].alloc_offset >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK; + if (stripe_offset == 0 && stripe_nr > 0) { + stripe_nr--; + stripe_offset = BTRFS_STRIPE_LEN; + } + alloc = ((stripe_nr * map->num_stripes + i) << BTRFS_STRIPE_LEN_SHIFT) + + stripe_offset; + last_alloc = max(last_alloc, alloc); + + /* Partially written stripe found. It should be last. 
*/ + if (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) + break; + } + stripe_nr = 0; + stripe_offset = 0; + if (last_alloc) { u32 factor = map->num_stripes; @@ -1582,7 +1613,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - + has_conventional = true; zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) @@ -1591,6 +1622,28 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, zone_info[i].alloc_offset += stripe_offset; } + /* Verification */ + if (i != 0) { + if (unlikely(prev_offset < zone_info[i].alloc_offset)) { + btrfs_err(fs_info, + "zoned: stripe position disorder found in block group %llu", + bg->start); + return -EIO; + } + + if (unlikely(has_partial && + (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK))) { + btrfs_err(fs_info, + "zoned: multiple partial written stripe found in block group %llu", + bg->start); + return -EIO; + } + } + prev_offset = zone_info[i].alloc_offset; + + if ((zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) != 0) + has_partial = true; + if (test_bit(0, active) != test_bit(i, active)) { if (unlikely(!btrfs_zone_activate(bg))) return -EIO; @@ -1602,6 +1655,19 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, bg->alloc_offset += zone_info[i].alloc_offset; } + /* Check if all devices stay in the same stripe row. 
*/ + if (unlikely(zone_info[0].alloc_offset - + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", bg->start); + return -EIO; + } + + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", + bg->alloc_offset, last_alloc); + return -EIO; + } + return 0; } @@ -1612,8 +1678,11 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 AUTO_KFREE(raid0_allocs); u64 stripe_nr = 0, stripe_offset = 0; u32 stripe_index = 0; + bool has_partial = false, has_conventional = false; + u64 prev_offset = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1621,6 +1690,60 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs), + GFP_NOFS); + if (!raid0_allocs) + return -ENOMEM; + + /* + * When the last extent is removed, last_alloc can be smaller than the other write + * pointer. In that case, last_alloc should be moved to the corresponding write + * pointer position. 
+ */ + for (int i = 0; i < map->num_stripes; i += map->sub_stripes) { + u64 alloc = zone_info[i].alloc_offset; + + for (int j = 1; j < map->sub_stripes; j++) { + int idx = i + j; + + if (zone_info[idx].alloc_offset == WP_MISSING_DEV || + zone_info[idx].alloc_offset == WP_CONVENTIONAL) + continue; + if (alloc == WP_MISSING_DEV || alloc == WP_CONVENTIONAL) { + alloc = zone_info[idx].alloc_offset; + } else if (unlikely(zone_info[idx].alloc_offset != alloc)) { + btrfs_err(fs_info, + "zoned: write pointer mismatch found in block group %llu", + bg->start); + return -EIO; + } + } + + raid0_allocs[i / map->sub_stripes] = alloc; + if (alloc == WP_CONVENTIONAL) + continue; + if (unlikely(alloc == WP_MISSING_DEV)) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer of block group %llu due to missing device", + bg->start); + return -EIO; + } + + stripe_nr = alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = alloc & BTRFS_STRIPE_LEN_MASK; + if (stripe_offset == 0 && stripe_nr > 0) { + stripe_nr--; + stripe_offset = BTRFS_STRIPE_LEN; + } + + alloc = ((stripe_nr * (map->num_stripes / map->sub_stripes) + + (i / map->sub_stripes)) << + BTRFS_STRIPE_LEN_SHIFT) + stripe_offset; + last_alloc = max(last_alloc, alloc); + } + stripe_nr = 0; + stripe_offset = 0; + if (last_alloc) { u32 factor = map->num_stripes / map->sub_stripes; @@ -1630,24 +1753,51 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV) - continue; + int idx = i / map->sub_stripes; + + if (raid0_allocs[idx] == WP_CONVENTIONAL) { + has_conventional = true; + raid0_allocs[idx] = btrfs_stripe_nr_to_offset(stripe_nr); + + if (stripe_index > idx) + raid0_allocs[idx] += BTRFS_STRIPE_LEN; + else if (stripe_index == idx) + raid0_allocs[idx] += stripe_offset; + } + + if ((i % map->sub_stripes) == 0) { + /* Verification */ + if (i != 0) { + if (unlikely(prev_offset < raid0_allocs[idx])) { + 
btrfs_err(fs_info, + "zoned: stripe position disorder found in block group %llu", + bg->start); + return -EIO; + } + + if (unlikely(has_partial && + (raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK))) { + btrfs_err(fs_info, + "zoned: multiple partial written stripe found in block group %llu", + bg->start); + return -EIO; + } + } + prev_offset = raid0_allocs[idx]; + + if ((raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK) != 0) + has_partial = true; + } + + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = raid0_allocs[idx]; if (test_bit(0, active) != test_bit(i, active)) { if (unlikely(!btrfs_zone_activate(bg))) return -EIO; - } else { - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); - } - - if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); - - if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; - else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += stripe_offset; + } else if (test_bit(0, active)) { + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } if ((i % map->sub_stripes) == 0) { @@ -1656,6 +1806,20 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } } + /* Check if all devices stay in the same stripe row. 
*/ + if (unlikely(zone_info[0].alloc_offset - + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", + bg->start); + return -EIO; + } + + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", + bg->alloc_offset, last_alloc); + return -EIO; + } + return 0; } From 3fe608dbac8c3d714472fab424bf522f39a3f60b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 23 Jan 2026 09:14:44 +0100 Subject: [PATCH 114/137] btrfs: zoned: use local fs_info variable in btrfs_load_block_group_dup() btrfs_load_block_group_dup() has a local pointer to fs_info, yet the error prints dereference fs_info from the block_group. Use local fs_info variable to make the code more uniform. Reviewed-by: Daniel Vacek Reviewed-by: Qu Wenruo Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7fa60a44d716..b792136e3d08 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1438,13 +1438,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[0].physical); return -EIO; } if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", zone_info[1].physical); return -EIO; @@ -1471,7 +1471,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].alloc_offset = last_alloc; if (unlikely(zone_info[0].alloc_offset != 
zone_info[1].alloc_offset)) { - btrfs_err(bg->fs_info, + btrfs_err(fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); return -EIO; } From c757edbef980d4e5d70749d3c98e00279af58b96 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 26 Jan 2026 09:05:24 +0100 Subject: [PATCH 115/137] btrfs: fix copying the flags of btrfs_bio after split When a btrfs_bio gets split, only 'bbio->csum_search_commit_root' gets copied to the new btrfs_bio, all the other flags don't. When a bio is split in btrfs_submit_chunk(), btrfs_split_bio() creates the new split bio via btrfs_bio_init() which zeroes the struct with memset. Looking at btrfs_split_bio(), it copies csum_search_commit_root from the original but does not copy can_use_append. After the split, the code does: bbio = split; bio = &bbio->bio; This means the split bio (with can_use_append = false) gets submitted, not the original. In btrfs_submit_dev_bio(), the condition: if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(...)) Will be false for the split bio even when writing to a sequential zone. Does the split bio need to inherit can_use_append from the original? The old code used a local variable use_append which persisted across the split. Copy the rest of the flags as well. 
Link: https://lore.kernel.org/linux-btrfs/20260125132120.2525146-1-clm@meta.com/ Reported-by: Chris Mason Reviewed-by: Qu Wenruo Reviewed-by: Boris Burkov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d3475d179362..0a69e09bfe28 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,7 +97,13 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bbio->orig_logical = orig_bbio->orig_logical; orig_bbio->orig_logical += map_length; } + bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; + bbio->can_use_append = orig_bbio->can_use_append; + bbio->is_scrub = orig_bbio->is_scrub; + bbio->is_remap = orig_bbio->is_remap; + bbio->async_csum = orig_bbio->async_csum; + atomic_inc(&orig_bbio->pending_ios); return bbio; } From 3a1f4264daed4b419c325a7fe35e756cada3cf82 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 22 Dec 2025 16:15:44 -0800 Subject: [PATCH 116/137] btrfs: fix block_group_tree dirty_list corruption When the incompat flag EXTENT_TREE_V2 is set, we unconditionally add the block group tree to the switch_commits list before calling switch_commit_roots, as we do for the tree root and the chunk root. However, the block group tree uses normal root dirty tracking and in any transaction that does an allocation and dirties a block group, the block group root will already be linked to a list by the dirty_list field and this use of list_add_tail() is invalid and corrupts the prev/next members of block_group_root->dirty_list. This is apparent on a subsequent list_del on the prev if we enable CONFIG_DEBUG_LIST: [32.1571] ------------[ cut here ]------------ [32.1572] list_del corruption. next->prev should beffff958890202538, but was ffff9588992bd538. 
(next=ffff958890201538) [32.1575] WARNING: lib/list_debug.c:65 at 0x0, CPU#3: sync/607 [32.1583] CPU: 3 UID: 0 PID: 607 Comm: sync Not tainted 6.18.0 #24PREEMPT(none) [32.1585] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS1.17.0-4.fc41 04/01/2014 [32.1587] RIP: 0010:__list_del_entry_valid_or_report+0x108/0x120 [32.1593] RSP: 0018:ffffaa288287fdd0 EFLAGS: 00010202 [32.1594] RAX: 0000000000000001 RBX: ffff95889326e800 RCX:ffff958890201538 [32.1596] RDX: ffff9588992bd538 RSI: ffff958890202538 RDI:ffffffff82a41e00 [32.1597] RBP: ffff958890202538 R08: ffffffff828fc1e8 R09:00000000ffffefff [32.1599] R10: ffffffff8288c200 R11: ffffffff828e4200 R12:ffff958890201538 [32.1601] R13: ffff95889326e958 R14: ffff958895c24000 R15:ffff958890202538 [32.1603] FS: 00007f0c28eb5740(0000) GS:ffff958af2bd2000(0000)knlGS:0000000000000000 [32.1605] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [32.1607] CR2: 00007f0c28e8a3cc CR3: 0000000109942005 CR4:0000000000370ef0 [32.1609] Call Trace: [32.1610] [32.1611] switch_commit_roots+0x82/0x1d0 [btrfs] [32.1615] btrfs_commit_transaction+0x968/0x1550 [btrfs] [32.1618] ? btrfs_attach_transaction_barrier+0x23/0x60 [btrfs] [32.1621] __iterate_supers+0xe8/0x190 [32.1622] ? 
__pfx_sync_fs_one_sb+0x10/0x10 [32.1623] ksys_sync+0x63/0xb0 [32.1624] __do_sys_sync+0xe/0x20 [32.1625] do_syscall_64+0x73/0x450 [32.1626] entry_SYSCALL_64_after_hwframe+0x76/0x7e [32.1627] RIP: 0033:0x7f0c28d05d2b [32.1632] RSP: 002b:00007ffc9d988048 EFLAGS: 00000246 ORIG_RAX:00000000000000a2 [32.1634] RAX: ffffffffffffffda RBX: 00007ffc9d988228 RCX:00007f0c28d05d2b [32.1636] RDX: 00007f0c28e02301 RSI: 00007ffc9d989b21 RDI:00007f0c28dba90d [32.1637] RBP: 0000000000000001 R08: 0000000000000001 R09:0000000000000000 [32.1639] R10: 0000000000000000 R11: 0000000000000246 R12:000055b96572cb80 [32.1641] R13: 000055b96572b19f R14: 00007f0c28dfa434 R15:000055b96572b034 [32.1643] [32.1644] irq event stamp: 0 [32.1644] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [32.1646] hardirqs last disabled at (0): []copy_process+0xb37/0x2260 [32.1648] softirqs last enabled at (0): []copy_process+0xb37/0x2260 [32.1650] softirqs last disabled at (0): [<0000000000000000>] 0x0 [32.1652] ---[ end trace 0000000000000000 ]--- Furthermore, this list corruption eventually (when we happen to add a new block group) results in getting the switch_commits and dirty_cowonly_roots lists mixed up and attempting to call update_root on the tree root which can't be found in the tree root, resulting in a transaction abort: [87.8269] BTRFS critical (device nvme1n1): unable to find root key (1 0 0) in tree 1 [87.8272] ------------[ cut here ]------------ [87.8274] BTRFS: Transaction aborted (error -117) [87.8275] WARNING: fs/btrfs/root-tree.c:153 at 0x0, CPU#4: sync/703 [87.8285] CPU: 4 UID: 0 PID: 703 Comm: sync Not tainted 6.18.0 #25 PREEMPT(none) [87.8287] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-4.fc41 04/01/2014 [87.8289] RIP: 0010:btrfs_update_root+0x296/0x790 [btrfs] [87.8295] RSP: 0018:ffffa58d035dfd60 EFLAGS: 00010282 [87.8297] RAX: ffff9a59126ddb68 RBX: ffff9a59126dc000 RCX: 0000000000000000 [87.8299] RDX: 0000000000000000 RSI: 00000000ffffff8b RDI: ffffffffc0b28270 
[87.8301] RBP: ffff9a5904aec000 R08: 0000000000000000 R09: 00000000ffffefff [87.8303] R10: ffffffff9ac8c200 R11: ffffffff9ace4200 R12: 0000000000000001 [87.8305] R13: ffff9a59041740e8 R14: ffff9a5904aec1f7 R15: ffff9a590fdefaf0 [87.8307] FS: 00007f54cde6b740(0000) GS:ffff9a5b5a81c000(0000) knlGS:0000000000000000 [87.8309] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [87.8310] CR2: 00007f54cde403cc CR3: 0000000112902004 CR4: 0000000000370ef0 [87.8312] Call Trace: [87.8313] [87.8314] ? _raw_spin_unlock+0x23/0x40 [87.8315] commit_cowonly_roots+0x1ad/0x250 [btrfs] [87.8317] ? btrfs_commit_transaction+0x79b/0x1560 [btrfs] [87.8320] btrfs_commit_transaction+0x8aa/0x1560 [btrfs] [87.8322] ? btrfs_attach_transaction_barrier+0x23/0x60 [btrfs] [87.8325] __iterate_supers+0xf1/0x170 [87.8326] ? __pfx_sync_fs_one_sb+0x10/0x10 [87.8327] ksys_sync+0x63/0xb0 [87.8328] __do_sys_sync+0xe/0x20 [87.8329] do_syscall_64+0x73/0x450 [87.8330] entry_SYSCALL_64_after_hwframe+0x76/0x7e [87.8331] RIP: 0033:0x7f54cdd05d2b [87.8336] RSP: 002b:00007fff1b58ff78 EFLAGS: 00000246 ORIG_RAX: 00000000000000a2 [87.8338] RAX: ffffffffffffffda RBX: 00007fff1b590158 RCX: 00007f54cdd05d2b [87.8340] RDX: 00007f54cde02301 RSI: 00007fff1b592b66 RDI: 00007f54cddba90d [87.8342] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 [87.8344] R10: 0000000000000000 R11: 0000000000000246 R12: 000055e07ca96b80 [87.8346] R13: 000055e07ca9519f R14: 00007f54cddfa434 R15: 000055e07ca95034 [87.8348] [87.8348] irq event stamp: 0 [87.8349] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [87.8351] hardirqs last disabled at (0): [] copy_process+0xb37/0x21e0 [87.8353] softirqs last enabled at (0): [] copy_process+0xb37/0x21e0 [87.8355] softirqs last disabled at (0): [<0000000000000000>] 0x0 [87.8357] ---[ end trace 0000000000000000 ]--- [87.8358] BTRFS: error (device nvme1n1 state A) in btrfs_update_root:153: errno=-117 Filesystem corrupted [87.8360] BTRFS info (device nvme1n1 state EA): forced readonly 
[87.8362] BTRFS warning (device nvme1n1 state EA): Skipping commit of aborted transaction. [87.8364] BTRFS: error (device nvme1n1 state EA) in cleanup_transaction:2037: errno=-117 Filesystem corrupted Since the block group tree was pulled out of the extent tree and uses normal root dirty tracking, remove the offending extra list_add. This fixes the list corruption and the resulting fs corruption. Fixes: 14033b08a029 ("btrfs: don't save block group root into super block") Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8aa55cd8a0bf..0b2498749b1e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2508,13 +2508,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_set_root_node(&fs_info->block_group_root->root_item, - fs_info->block_group_root->node); - list_add_tail(&fs_info->block_group_root->dirty_list, - &cur_trans->switch_commits); - } - switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); From 6f926597f9837577f5ada47eaa764fea4a2ca9a3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Jan 2026 09:49:57 +0000 Subject: [PATCH 117/137] btrfs: abort transaction on error in btrfs_remove_block_group() When btrfs_remove_block_group() fails we abort the transaction in its single caller (btrfs_remove_chunk()). This makes it harder to find out where exactly the failure happened, as several steps inside btrfs_remove_block_group() can fail. So make btrfs_remove_block_group() abort the transaction whenever an error happens, instead of aborting in its caller. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 19 ++++++++++++++----- fs/btrfs/volumes.c | 7 +++---- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3a0521236ecd..7b723571501e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1108,8 +1108,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, map->start); - if (!block_group) + if (unlikely(!block_group)) { + btrfs_abort_transaction(trans, -ENOENT); return -ENOENT; + } BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)); @@ -1143,8 +1145,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); - if (!path) { + if (unlikely(!path)) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -1180,8 +1183,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&trans->transaction->cache_write_mutex); ret = btrfs_remove_free_space_inode(trans, inode, block_group); - if (ret) + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); goto out; + } write_lock(&fs_info->block_group_cache_lock); rb_erase_cached(&block_group->cache_node, @@ -1268,13 +1273,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, */ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { ret = btrfs_remove_block_group_free_space(trans, block_group); - if (ret) + if (unlikely(ret)) { + btrfs_abort_transaction(trans, ret); goto out; + } } ret = remove_block_group_item(trans, path, block_group); - if (ret < 0) + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); goto out; + } spin_lock(&block_group->lock); /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cff2412bc879..d33780082b8d 100644 --- 
a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3384,11 +3384,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); + /* On error, btrfs_remove_block_group() aborts the transaction. */ ret = btrfs_remove_block_group(trans, map); - if (unlikely(ret)) { - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(ret)) + ASSERT(BTRFS_FS_ERROR(fs_info) != 0); out: if (trans->removing_chunk) { From 719dc4b75561f7f11ff42ccf8401fcac72d3804f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 23 Jan 2026 10:05:12 +0000 Subject: [PATCH 118/137] btrfs: do not BUG_ON() in btrfs_remove_block_group() There's no need to BUG_ON(), we can just abort the transaction and return an error. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7b723571501e..3186ed4fd26d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1113,7 +1113,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, return -ENOENT; } - BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)); + if (unlikely(!block_group->ro && + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; + } trace_btrfs_remove_block_group(block_group); /* From 912d1c6680bdb40b72b1b9204706f32b6eb842c3 Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 28 Jan 2026 07:06:38 +0000 Subject: [PATCH 119/137] btrfs: continue trimming remaining devices on failure Commit 93bba24d4b5a ("btrfs: Enhance btrfs_trim_fs function to handle error better") intended to make device trimming continue even if one device fails, tracking failures and reporting them at the end. 
However, it used 'break' instead of 'continue', causing the loop to exit on the first device failure. Fix this by replacing 'break' with 'continue'. Fixes: 93bba24d4b5a ("btrfs: Enhance btrfs_trim_fs function to handle error better") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Robbie Ko Signed-off-by: jinbaohong Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a91bce05ffb4..b63296e9abf4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6688,7 +6688,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (ret) { dev_failed++; dev_ret = ret; - break; + continue; } } mutex_unlock(&fs_devices->device_list_mutex); From 1cc4ada4182fadb82837831cabf1027bff0322d7 Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 28 Jan 2026 07:06:39 +0000 Subject: [PATCH 120/137] btrfs: preserve first error in btrfs_trim_fs() When multiple block groups or devices fail during trim, preserve the first error encountered rather than the last one. The first error is typically more useful for debugging as it represents the original failure, while subsequent errors may be cascading effects. 
Signed-off-by: Robbie Ko Signed-off-by: jinbaohong Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b63296e9abf4..8e405a1011f4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6608,7 +6608,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) * 2) trimming the unallocated space on each device * * This will also continue trimming even if a block group or device encounters - * an error. The return value will be the last error, or 0 if nothing bad + * an error. The return value will be the first error, or 0 if nothing bad * happens. */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) @@ -6653,7 +6653,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; - bg_ret = ret; + if (!bg_ret) + bg_ret = ret; continue; } } @@ -6666,7 +6667,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; if (ret) { bg_failed++; - bg_ret = ret; + if (!bg_ret) + bg_ret = ret; continue; } } @@ -6674,7 +6676,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (bg_failed) btrfs_warn(fs_info, - "failed to trim %llu block group(s), last error %d", + "failed to trim %llu block group(s), first error %d", bg_failed, bg_ret); mutex_lock(&fs_devices->device_list_mutex); @@ -6687,7 +6689,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; if (ret) { dev_failed++; - dev_ret = ret; + if (!dev_ret) + dev_ret = ret; continue; } } @@ -6695,7 +6698,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (dev_failed) btrfs_warn(fs_info, - "failed to 
trim %llu device(s), last error %d", + "failed to trim %llu device(s), first error %d", dev_failed, dev_ret); range->len = trimmed; if (bg_ret) From bfb670b9183b0e4ba660aff2e396ec1cc01d0761 Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 28 Jan 2026 07:06:40 +0000 Subject: [PATCH 121/137] btrfs: handle user interrupt properly in btrfs_trim_fs() When a fatal signal is pending or the process is freezing, btrfs_trim_block_group() and btrfs_trim_free_extents() return -ERESTARTSYS. Currently this is treated as a regular error: the loops continue to the next iteration and count it as a block group or device failure. Instead, break out of the loops immediately and return -ERESTARTSYS to userspace without counting it as a failure. Also skip the device loop entirely if the block group loop was interrupted. Reviewed-by: Qu Wenruo Signed-off-by: Robbie Ko Signed-off-by: jinbaohong Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8e405a1011f4..87fd94449f11 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6665,6 +6665,10 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) range->minlen); trimmed += group_trimmed; + if (ret == -ERESTARTSYS || ret == -EINTR) { + btrfs_put_block_group(cache); + break; + } if (ret) { bg_failed++; if (!bg_ret) @@ -6679,6 +6683,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) "failed to trim %llu block group(s), first error %d", bg_failed, bg_ret); + if (ret == -ERESTARTSYS || ret == -EINTR) + return ret; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) @@ -6687,6 +6694,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range 
*range) ret = btrfs_trim_free_extents(device, &group_trimmed); trimmed += group_trimmed; + if (ret == -ERESTARTSYS || ret == -EINTR) + break; if (ret) { dev_failed++; if (!dev_ret) @@ -6701,6 +6710,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) "failed to trim %llu device(s), first error %d", dev_failed, dev_ret); range->len = trimmed; + if (ret == -ERESTARTSYS || ret == -EINTR) + return ret; if (bg_ret) return bg_ret; return dev_ret; From b291ad4458df8311626dfa0a089918f6a542d6bc Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 28 Jan 2026 07:06:41 +0000 Subject: [PATCH 122/137] btrfs: fix transaction commit blocking during trim of unallocated space When trimming unallocated space, btrfs_trim_fs() holds the device_list_mutex for the entire duration while iterating through all devices. On large filesystems with significant unallocated space, this operation can take minutes to hours. This causes a problem because btrfs_run_dev_stats(), which is called during transaction commit, also requires device_list_mutex: btrfs_trim_fs() mutex_lock(&fs_devices->device_list_mutex) list_for_each_entry(device, ...) btrfs_trim_free_extents(device) mutex_unlock(&fs_devices->device_list_mutex) commit_transaction() btrfs_run_dev_stats() mutex_lock(&fs_devices->device_list_mutex) // blocked! ... While trim is running, all transaction commits are blocked waiting for the mutex. Fix this by refactoring btrfs_trim_free_extents() to process devices in bounded chunks (up to 2GB per iteration) and release device_list_mutex between chunks. 
Signed-off-by: robbieko Signed-off-by: jinbaohong Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 156 +++++++++++++++++++++++++++++++++++------ fs/btrfs/fs.h | 6 ++ 2 files changed, 140 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87fd94449f11..03cf9f242c70 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6513,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6 * it while performing the free space search since we have already * held back allocations. */ -static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) +static int btrfs_trim_free_extents_throttle(struct btrfs_device *device, + u64 *trimmed, u64 pos, u64 *ret_next_pos) { - u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; + u64 start = pos; + u64 trim_len = 0; *trimmed = 0; @@ -6536,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) while (1) { struct btrfs_fs_info *fs_info = device->fs_info; + u64 cur_start; + u64 end; + u64 len; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; + cur_start = start; btrfs_find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + start = max(start, cur_start); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { @@ -6570,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) end = min(end, device->total_bytes - 1); len = end - start + 1; + len = min(len, BTRFS_MAX_TRIM_LENGTH); /* We didn't find any extents */ if (!len) { @@ -6590,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; + trim_len += len; + if (trim_len >= BTRFS_MAX_TRIM_LENGTH) { + *ret_next_pos = start; + ret = 
-EAGAIN; + break; + } if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; @@ -6602,6 +6616,122 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) return ret; } +static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed, + u64 *dev_failed, int *dev_ret) +{ + struct btrfs_device *dev; + struct btrfs_device *working_dev = NULL; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u8 uuid[BTRFS_UUID_SIZE]; + u64 start = BTRFS_DEVICE_RANGE_RESERVED; + + *trimmed = 0; + *dev_failed = 0; + *dev_ret = 0; + + /* Find the device with the smallest UUID to start. */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!working_dev) + return 0; + + while (1) { + u64 group_trimmed = 0; + u64 next_pos = 0; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + /* Find and trim the current device. */ + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (dev == working_dev) { + ret = btrfs_trim_free_extents_throttle(working_dev, + &group_trimmed, start, &next_pos); + break; + } + } + + /* Throttle: continue the same device from the new position. */ + if (ret == -EAGAIN && next_pos > start) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + start = next_pos; + cond_resched(); + continue; + } + + /* User interrupted. */ + if (ret == -ERESTARTSYS || ret == -EINTR) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + return ret; + } + + /* + * Device completed (ret == 0), failed, or EAGAIN with no progress. 
+ * Record error if any, then move to next device. + */ + if (ret == -EAGAIN) { + /* No progress - log and skip device. */ + btrfs_warn(fs_info, + "trim throttle: no progress, offset=%llu device %s, skipping", + start, btrfs_dev_name(working_dev)); + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } else if (ret) { + /* Device failed with error. */ + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } + + /* + * Find next device: smallest UUID larger than current. + * Devices added during trim with smaller UUID will be skipped. + */ + working_dev = NULL; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + /* Must larger than current UUID. */ + if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0) + continue; + /* Find the smallest. */ + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + + mutex_unlock(&fs_devices->device_list_mutex); + + *trimmed += group_trimmed; + start = BTRFS_DEVICE_RANGE_RESERVED; + + /* No more devices. 
*/ + if (!working_dev) + break; + + cond_resched(); + } + + return 0; +} + /* * Trim the whole filesystem by: * 1) trimming the free space in each block group @@ -6613,9 +6743,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; - struct btrfs_device *device; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -6686,24 +6814,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (ret == -ERESTARTSYS || ret == -EINTR) return ret; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - continue; - - ret = btrfs_trim_free_extents(device, &group_trimmed); - - trimmed += group_trimmed; - if (ret == -ERESTARTSYS || ret == -EINTR) - break; - if (ret) { - dev_failed++; - if (!dev_ret) - dev_ret = ret; - continue; - } - } - mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret); + trimmed += group_trimmed; if (dev_failed) btrfs_warn(fs_info, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d3762fbe7267..3de3b517810e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -65,6 +65,12 @@ struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Maximum length to trim in a single iteration to avoid holding device list + * mutex for too long. 
+ */ +#define BTRFS_MAX_TRIM_LENGTH SZ_2G + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 From b14c5e04bd0f722ed631845599d52d03fcae1bc1 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 29 Jan 2026 16:11:21 -0800 Subject: [PATCH 123/137] btrfs: fix EEXIST abort due to non-consecutive gaps in chunk allocation I have been observing a number of systems aborting at insert_dev_extents() in btrfs_create_pending_block_groups(). The following is a sample stack trace of such an abort coming from forced chunk allocation (typically behind CONFIG_BTRFS_EXPERIMENTAL) but this can theoretically happen to any DUP chunk allocation. [81.801] ------------[ cut here ]------------ [81.801] BTRFS: Transaction aborted (error -17) [81.801] WARNING: fs/btrfs/block-group.c:2876 at btrfs_create_pending_block_groups+0x721/0x770 [btrfs], CPU#1: bash/319 [81.802] Modules linked in: virtio_net btrfs xor zstd_compress raid6_pq null_blk [81.803] CPU: 1 UID: 0 PID: 319 Comm: bash Kdump: loaded Not tainted 6.19.0-rc6+ #319 NONE [81.803] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.17.0-2-2 04/01/2014 [81.804] RIP: 0010:btrfs_create_pending_block_groups+0x723/0x770 [btrfs] [81.806] RSP: 0018:ffffa36241a6bce8 EFLAGS: 00010282 [81.806] RAX: 000000000000000d RBX: ffff8e699921e400 RCX: 0000000000000000 [81.807] RDX: 0000000002040001 RSI: 00000000ffffffef RDI: ffffffffc0608bf0 [81.807] RBP: 00000000ffffffef R08: ffff8e69830f6000 R09: 0000000000000007 [81.808] R10: ffff8e699921e5e8 R11: 0000000000000000 R12: ffff8e6999228000 [81.808] R13: ffff8e6984d82000 R14: ffff8e69966a69c0 R15: ffff8e69aa47b000 [81.809] FS: 00007fec6bdd9740(0000) GS:ffff8e6b1b379000(0000) knlGS:0000000000000000 [81.809] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [81.810] CR2: 00005604833670f0 CR3: 0000000116679000 CR4: 00000000000006f0 [81.810] Call Trace: [81.810] [81.810] __btrfs_end_transaction+0x3e/0x2b0 [btrfs] [81.811] btrfs_force_chunk_alloc_store+0xcd/0x140 [btrfs] 
[81.811] kernfs_fop_write_iter+0x15f/0x240 [81.812] vfs_write+0x264/0x500 [81.812] ksys_write+0x6c/0xe0 [81.812] do_syscall_64+0x66/0x770 [81.812] entry_SYSCALL_64_after_hwframe+0x76/0x7e [81.813] RIP: 0033:0x7fec6be66197 [81.814] RSP: 002b:00007fffb159dd30 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [81.815] RAX: ffffffffffffffda RBX: 00007fec6bdd9740 RCX: 00007fec6be66197 [81.815] RDX: 0000000000000002 RSI: 0000560483374f80 RDI: 0000000000000001 [81.816] RBP: 0000560483374f80 R08: 0000000000000000 R09: 0000000000000000 [81.816] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000002 [81.817] R13: 00007fec6bfb85c0 R14: 00007fec6bfb5ee0 R15: 00005604833729c0 [81.817] [81.817] irq event stamp: 20039 [81.818] hardirqs last enabled at (20047): [] __up_console_sem+0x52/0x60 [81.818] hardirqs last disabled at (20056): [] __up_console_sem+0x37/0x60 [81.819] softirqs last enabled at (19470): [] __irq_exit_rcu+0x96/0xc0 [81.819] softirqs last disabled at (19463): [] __irq_exit_rcu+0x96/0xc0 [81.820] ---[ end trace 0000000000000000 ]--- [81.820] BTRFS: error (device dm-7 state A) in btrfs_create_pending_block_groups:2876: errno=-17 Object already exists Inspecting these aborts with drgn, I observed a pattern of overlapping chunk_maps. Note how stripe 1 of the first chunk overlaps in physical address with stripe 0 of the second chunk. Physical Start Physical End Length Logical Type Stripe ---------------------------------------------------------------------------------------------------- 0x0000000102500000 0x0000000142500000 1.0G 0x0000000641d00000 META|DUP 0/2 0x0000000142500000 0x0000000182500000 1.0G 0x0000000641d00000 META|DUP 1/2 0x0000000142500000 0x0000000182500000 1.0G 0x0000000601d00000 META|DUP 0/2 0x0000000182500000 0x00000001c2500000 1.0G 0x0000000601d00000 META|DUP 1/2 Now how could this possibly happen? 
All chunk allocation is protected by the chunk_mutex so racing allocations should see a consistent view of the CHUNK_ALLOCATED bit in the chunk allocation extent-io-tree (device->alloc_state as set by chunk_map_device_set_bits()). The tree itself is protected by a spin lock, and clearing/setting the bits is always protected by fs_info->mapping_tree_lock, so no race is apparent. It turns out that there is a subtle bug in the logic regarding chunk allocations that have happened in the current transaction, known as "pending extents". The chunk allocation as defined in find_free_dev_extent() is a loop which searches the commit root of the dev_root and looks for gaps between DEV_EXTENT items. For those gaps, it then checks the alloc_state bitmap for any pending extents and adjusts the hole that it finds accordingly. However, the logic in that adjustment assumes that the first pending extent is the only one in that range. e.g., given a layout with two non-consecutive pending extents in a hole passed to dev_extent_hole_check() via *hole_start and *hole_size: |----pending A----| real hole |----pending B----| | candidate hole | *hole_start *hole_start + *hole_size the code incorrectly returns a "hole" from the end of pending extent A until the passed in hole end, failing to account for pending B. However, it is not entirely obvious that it is actually possible to produce such a layout. I was able to reproduce it, but with some contortions: I continued to use the force chunk allocation sysfs file and I introduced a long delay (10 seconds) into the start of the cleaner thread. I also prevented the unused bgs cleaning logic from ever deleting metadata bgs. These help make it easier to deterministically produce the condition but shouldn't really matter if you imagine the conditions happening by race/luck. 
Allocations/frees can happen concurrently with the cleaner thread preparing to process an unused extent and both create some used chunks with an unused chunk interleaved, all during one transaction. Then btrfs_delete_unused_bgs() sees the unused one and clears it, leaving a range with several pending chunk allocations and a gap in the middle. The basic idea is that the unused_bgs cleanup work happens on a worker so if we allocate 3 block groups in one transaction, then the cleaner work kicked off by the previous transaction comes through and deletes the middle one of the 3, then the commit root shows no dev extents and we have the bad pattern in the extent-io-tree. One final consideration is that the code happens to loop to the next hole if there are no more extents at all, so we need one more dev extent way past the area we are working in. Something like the following demonstrates the technique: # push the BG frontier out to 20G fallocate -l 20G $mnt/foo # allocate one more that will prevent the "no more dev extents" luck fallocate -l 1G $mnt/sticky # sync sync # clear out the allocation area rm $mnt/foo sync _cleaner # let everything quiesce sleep 20 sync # dev tree should have one bg 20G out and the rest at the beginning.. # sort of like an empty FS but with a random sticky chunk. # kick off the cleaner in the background, remember it will sleep 10s # before doing interesting work _cleaner & sleep 3 # create 3 trivial block groups, all empty, all immediately marked as unused. echo 1 > "$(_btrfs_sysfs_space_info $dev metadata)/force_chunk_alloc" echo 1 > "$(_btrfs_sysfs_space_info $dev data)/force_chunk_alloc" echo 1 > "$(_btrfs_sysfs_space_info $dev metadata)/force_chunk_alloc" # let the cleaner thread definitely finish, it will remove the data bg sleep 10 # this allocation sees the non-consecutive pending metadata chunks with # data chunk gap of 1G and allocates a 2G extent in that hole. ENOSPC! 
echo 1 > "$(_btrfs_sysfs_space_info $dev metadata)/force_chunk_alloc" As for the fix, it is not that obvious. I could not see a trivial way to do it even by adding backup loops into find_free_dev_extent(), so I opted to change the semantics of dev_extent_hole_check() to not stop looping until it finds a sufficiently big hole. For clarity, this also required changing the helper function contains_pending_extent() into two new helpers which find the first pending extent and the first suitable hole in a range. I attempted to clean up the documentation and range calculations to be as consistent and clear as possible for the future. I also looked at the zoned case and concluded that the loop there is different and not to be unified with this one. As far as I can tell, the zoned check will only further constrain the hole so looping back to find more holes is acceptable. Though given that zoned really only appends, I find it highly unlikely that it is susceptible to this bug. Fixes: 1b9845081633 ("Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole") Reported-by: Dimitrios Apostolou Closes: https://lore.kernel.org/linux-btrfs/q7760374-q1p4-029o-5149-26p28421s468@tzk.arg/ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 243 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 183 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d33780082b8d..329a922893b4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1509,30 +1509,158 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, } /* - * Try to find a chunk that intersects [start, start + len] range and when one - * such is found, record the end of it in *start + * Find the first pending extent intersecting a range. 
+ * + * @device: the device to search + * @start: start of the range to check + * @len: length of the range to check + * @pending_start: output pointer for the start of the found pending extent + * @pending_end: output pointer for the end of the found pending extent (inclusive) + * + * Search for a pending chunk allocation that intersects the half-open range + * [start, start + len). + * + * Return: true if a pending extent was found, false otherwise. + * If the return value is true, store the first pending extent in + * [*pending_start, *pending_end]. Otherwise, the two output variables + * may still be modified, to something outside the range and should not + * be used. */ -static bool contains_pending_extent(struct btrfs_device *device, u64 *start, - u64 len) +static bool first_pending_extent(struct btrfs_device *device, u64 start, u64 len, + u64 *pending_start, u64 *pending_end) { - u64 physical_start, physical_end; - lockdep_assert_held(&device->fs_info->chunk_mutex); - if (btrfs_find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, + if (btrfs_find_first_extent_bit(&device->alloc_state, start, + pending_start, pending_end, CHUNK_ALLOCATED, NULL)) { - if (in_range(physical_start, *start, len) || - in_range(*start, physical_start, - physical_end + 1 - physical_start)) { - *start = physical_end + 1; + if (in_range(*pending_start, start, len) || + in_range(start, *pending_start, *pending_end + 1 - *pending_start)) { return true; } } return false; } +/* + * Find the first real hole accounting for pending extents. + * + * @device: the device containing the candidate hole + * @start: input/output pointer for the hole start position + * @len: input/output pointer for the hole length + * @min_hole_size: the size of hole we are looking for + * + * Given a potential hole specified by [*start, *start + *len), check for pending + * chunk allocations within that range. 
If pending extents are found, the hole is + * adjusted to represent the first true free space that is large enough when + * accounting for pending chunks. + * + * Note that this function must handle various cases involving non consecutive + * pending extents. + * + * Returns: true if a suitable hole was found and false otherwise. + * If the return value is true, then *start and *len are set to represent the hole. + * If the return value is false, then *start is set to the largest hole we + * found and *len is set to its length. + * If there are no holes at all, then *start is set to the end of the range and + * *len is set to 0. + */ +static bool find_hole_in_pending_extents(struct btrfs_device *device, u64 *start, + u64 *len, u64 min_hole_size) +{ + u64 pending_start, pending_end; + u64 end; + u64 max_hole_start = 0; + u64 max_hole_len = 0; + + lockdep_assert_held(&device->fs_info->chunk_mutex); + + if (*len == 0) + return false; + + end = *start + *len - 1; + + /* + * Loop until we either see a large enough hole or check every pending + * extent overlapping the candidate hole. + * At every hole that we observe, record it if it is the new max. + * At the end of the iteration, set the output variables to the max hole. + */ + while (true) { + if (first_pending_extent(device, *start, *len, &pending_start, &pending_end)) { + /* + * Case 1: the pending extent overlaps the start of + * candidate hole. That means the true hole is after the + * pending extent, but we need to find the next pending + * extent to properly size the hole. In the next loop, + * we will reduce to case 2 or 3. + * e.g., + * + * |----pending A----| real hole |----pending B----| + * | candidate hole | + * *start end + */ + if (pending_start <= *start) { + *start = pending_end + 1; + goto next; + } + /* + * Case 2: The pending extent starts after *start (and overlaps + * [*start, end), so the first hole just goes up to the start + * of the pending extent. 
+ * e.g., + * + * | real hole |----pending A----| + * | candidate hole | + * *start end + */ + *len = pending_start - *start; + if (*len > max_hole_len) { + max_hole_start = *start; + max_hole_len = *len; + } + if (*len >= min_hole_size) + break; + /* + * If the hole wasn't big enough, then we advance past + * the pending extent and keep looking. + */ + *start = pending_end + 1; + goto next; + } else { + /* + * Case 3: There is no pending extent overlapping the + * range [*start, *start + *len - 1], so the only remaining + * hole is the remaining range. + * e.g., + * + * | candidate hole | + * | real hole | + * *start end + */ + + if (*len > max_hole_len) { + max_hole_start = *start; + max_hole_len = *len; + } + break; + } +next: + if (*start > end) + break; + *len = end - *start + 1; + } + if (max_hole_len) { + *start = max_hole_start; + *len = max_hole_len; + } else { + *start = end + 1; + *len = 0; + } + return max_hole_len >= min_hole_size; +} + static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { @@ -1597,59 +1725,57 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, } /* - * Check if specified hole is suitable for allocation. + * Validate and adjust a hole for chunk allocation * - * @device: the device which we have the hole - * @hole_start: starting position of the hole - * @hole_size: the size of the hole - * @num_bytes: the size of the free space that we need + * @device: the device containing the candidate hole + * @hole_start: input/output pointer for the hole start position + * @hole_size: input/output pointer for the hole size + * @num_bytes: minimum allocation size required * - * This function may modify @hole_start and @hole_size to reflect the suitable - * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + * Check if the specified hole is suitable for allocation and adjust it if + * necessary. 
The hole may be modified to skip over pending chunk allocations + * and to satisfy stricter zoned requirements on zoned filesystems. + * + * For regular (non-zoned) allocation, if the hole after adjustment is smaller + * than @num_bytes, the search continues past additional pending extents until + * either a sufficiently large hole is found or no more pending extents exist. + * + * Return: true if a suitable hole was found and false otherwise. + * If the return value is true, then *hole_start and *hole_size are set to + * represent the hole we found. + * If the return value is false, then *hole_start is set to the largest + * hole we found and *hole_size is set to its length. + * If there are no holes at all, then *hole_start is set to the end of the range + * and *hole_size is set to 0. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, u64 *hole_size, u64 num_bytes) { - bool changed = false; - u64 hole_end = *hole_start + *hole_size; + bool found = false; + const u64 hole_end = *hole_start + *hole_size - 1; - for (;;) { - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + ASSERT(*hole_size > 0); - switch (device->fs_devices->chunk_alloc_policy) { - default: - btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); - fallthrough; - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ - break; - case BTRFS_CHUNK_ALLOC_ZONED: - if (dev_extent_hole_check_zoned(device, hole_start, - hole_size, num_bytes)) { - changed = true; - /* - * The changed hole can contain pending extent. - * Loop again to check that. 
- */ - continue; - } - break; - } +again: + *hole_size = hole_end - *hole_start + 1; + found = find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes); + if (!found) + return found; + ASSERT(*hole_size >= num_bytes); + switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; + case BTRFS_CHUNK_ALLOC_REGULAR: + return found; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes)) + goto again; break; } - return changed; + return found; } /* @@ -1708,7 +1834,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, ret = -ENOMEM; goto out; } -again: + if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -ENOSPC; @@ -1795,11 +1921,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, */ if (search_end > search_start) { hole_size = search_end - search_start; - if (dev_extent_hole_check(device, &search_start, &hole_size, - num_bytes)) { - btrfs_release_path(path); - goto again; - } + dev_extent_hole_check(device, &search_start, &hole_size, num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -5022,6 +5144,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 diff; u64 start; u64 free_diff = 0; + u64 pending_start, pending_end; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -5067,7 +5190,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) * in-memory chunks are synced to disk so that the loop below sees them * and relocates them accordingly. 
 */
-	if (contains_pending_extent(device, &start, diff)) {
+	if (first_pending_extent(device, start, diff, &pending_start, &pending_end)) {
 		mutex_unlock(&fs_info->chunk_mutex);
 		ret = btrfs_commit_transaction(trans);
 		if (ret)

From 5341c98450df7cf8dacc907a80e3362f3155c847 Mon Sep 17 00:00:00 2001
From: Boris Burkov
Date: Thu, 29 Jan 2026 16:11:22 -0800
Subject: [PATCH 124/137] btrfs: tests: add unit tests for pending extent
 walking functions

I ran into another sort of trivial bug in v1 of the patch and concluded
that these functions really ought to be unit tested. These two functions
form the core of searching the chunk allocation pending extent bitmap
and have relatively easily definable semantics, so unit testing them can
help ensure the correctness of chunk allocation.

I also made a minor unrelated fix in volumes.h to properly forward
declare btrfs_space_info. Because of the order of the includes in the
new test, this was actually hitting a latent build warning.

Note: This is an early example for me of a commit authored in part by an
AI agent, so I wanted to be more clear about what I did. I defined a
trivial test and explained the set of tests I wanted to the agent and it
produced the large set of test cases seen here. I then checked each test
case to make sure it matched the description and simplified the
constants and numbers until they looked reasonable to me. I then checked
the looping logic to make sure it made sense to the original spirit of
the trivial test. Finally, I carefully combed over all the lines it wrote
to loop over the tests it generated to make sure they followed our code
style guide.
Assisted-by: Claude:claude-opus-4-5 Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 +- fs/btrfs/tests/btrfs-tests.c | 3 + fs/btrfs/tests/btrfs-tests.h | 1 + fs/btrfs/tests/chunk-allocation-tests.c | 476 ++++++++++++++++++++++++ fs/btrfs/volumes.c | 14 +- fs/btrfs/volumes.h | 6 + 6 files changed, 495 insertions(+), 8 deletions(-) create mode 100644 fs/btrfs/tests/chunk-allocation-tests.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 743d7677b175..975104b74486 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,5 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \ + tests/chunk-allocation-tests.o diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b576897d71cc..7f13c05d3736 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -301,6 +301,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_delayed_refs(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_chunk_allocation(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 4307bdaa6749..b0e4b98bdc3d 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -45,6 +45,7 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); +int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 
sectorsize);
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tests/chunk-allocation-tests.c b/fs/btrfs/tests/chunk-allocation-tests.c
new file mode 100644
index 000000000000..9beb0602fc8c
--- /dev/null
+++ b/fs/btrfs/tests/chunk-allocation-tests.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026 Meta. All rights reserved.
+ */
+
+#include <linux/sizes.h>
+#include "btrfs-tests.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../extent-io-tree.h"
+
+/*
+ * Tests for chunk allocator pending extent internals.
+ * These two functions form the core of searching the chunk allocation pending
+ * extent bitmap and have relatively easily definable semantics, so unit
+ * testing them can help ensure the correctness of chunk allocation.
+ */
+
+/*
+ * Describes the inputs to the system and expected results
+ * when testing btrfs_find_hole_in_pending_extents().
+ */
+struct pending_extent_test_case {
+	const char *name;
+	/* Input range to search. */
+	u64 hole_start;
+	u64 hole_len;
+	/* The size of hole we are searching for. */
+	u64 min_hole_size;
+	/*
+	 * Pending extents to set up (up to 2 for up to 3 holes)
+	 * If len == 0, then it is skipped.
+	 */
+	struct {
+		u64 start;
+		u64 len;
+	} pending_extents[2];
+	/* Expected outputs.
*/ + bool expected_found; + u64 expected_start; + u64 expected_len; +}; + +static const struct pending_extent_test_case find_hole_tests[] = { + { + .name = "no pending extents", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { }, + .expected_found = true, + .expected_start = 0, + .expected_len = 10ULL * SZ_1G, + }, + { + .name = "pending extent at start of range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_1G, + .expected_len = 9ULL * SZ_1G, + }, + { + .name = "pending extent overlapping start of range", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = SZ_2G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 8ULL * SZ_1G, + }, + { + .name = "two holes; first hole is exactly big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = SZ_1G, + }, + { + .name = "two holes; first hole is big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "two holes; second hole is big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 8ULL * SZ_1G, + }, + { + .name = "three holes; first hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = SZ_1G }, + }, + 
.expected_found = true, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "three holes; second hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 5ULL * SZ_1G, .len = SZ_1G }, + }, + .expected_found = true, + .expected_start = SZ_2G, + .expected_len = 3ULL * SZ_1G, + }, + { + .name = "three holes; third hole big enough", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = true, + .expected_start = 8ULL * SZ_1G, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_2G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 6ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 0, + .expected_len = SZ_1G, + }, + { + .name = "three holes; all holes too small; first biggest", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_2G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 0, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small; second biggest", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = SZ_2G, + .expected_len = SZ_2G, + }, + { + .name = "three holes; all holes too small; third biggest", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = 3ULL * SZ_1G, + .pending_extents = { + { .start = SZ_1G, .len = SZ_1G }, + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, + }, + 
.expected_found = false, + .expected_start = 8ULL * SZ_1G, + .expected_len = SZ_2G, + }, + { + .name = "hole entirely allocated by pending", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 0, .len = 10ULL * SZ_1G }, + }, + .expected_found = false, + .expected_start = 10ULL * SZ_1G, + .expected_len = 0, + }, + { + .name = "pending extent at end of range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .min_hole_size = SZ_1G, + .pending_extents = { + { .start = 9ULL * SZ_1G, .len = SZ_2G }, + }, + .expected_found = true, + .expected_start = 0, + .expected_len = 9ULL * SZ_1G, + }, + { + .name = "zero length input", + .hole_start = SZ_1G, + .hole_len = 0, + .min_hole_size = SZ_1G, + .pending_extents = { }, + .expected_found = false, + .expected_start = SZ_1G, + .expected_len = 0, + }, +}; + +static int test_find_hole_in_pending(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_device *device; + int ret = 0; + + test_msg("running find_hole_in_pending_extents tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + device = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(device)) { + test_err("failed to allocate dummy device"); + ret = PTR_ERR(device); + goto out_free_fs_info; + } + device->fs_info = fs_info; + + for (int i = 0; i < ARRAY_SIZE(find_hole_tests); i++) { + const struct pending_extent_test_case *test_case = &find_hole_tests[i]; + u64 hole_start = test_case->hole_start; + u64 hole_len = test_case->hole_len; + bool found; + + for (int j = 0; j < ARRAY_SIZE(test_case->pending_extents); j++) { + u64 start = test_case->pending_extents[j].start; + u64 len = test_case->pending_extents[j].len; + + if (!len) + continue; + btrfs_set_extent_bit(&device->alloc_state, + start, start + len - 1, + CHUNK_ALLOCATED, NULL); + } + + mutex_lock(&fs_info->chunk_mutex); + found = 
btrfs_find_hole_in_pending_extents(device, &hole_start, &hole_len, + test_case->min_hole_size); + mutex_unlock(&fs_info->chunk_mutex); + + if (found != test_case->expected_found) { + test_err("%s: expected found=%d, got found=%d", + test_case->name, test_case->expected_found, found); + ret = -EINVAL; + goto out_clear_pending_extents; + } + if (hole_start != test_case->expected_start || + hole_len != test_case->expected_len) { + test_err("%s: expected [%llu, %llu), got [%llu, %llu)", + test_case->name, test_case->expected_start, + test_case->expected_start + + test_case->expected_len, + hole_start, hole_start + hole_len); + ret = -EINVAL; + goto out_clear_pending_extents; + } +out_clear_pending_extents: + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, + CHUNK_ALLOCATED, NULL); + if (ret) + break; + } + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +/* + * Describes the inputs to the system and expected results + * when testing btrfs_first_pending_extent(). + */ +struct first_pending_test_case { + const char *name; + /* The range to look for a pending extent in. */ + u64 hole_start; + u64 hole_len; + /* The pending extent to look for. */ + struct { + u64 start; + u64 len; + } pending_extent; + /* Expected outputs. 
*/ + bool expected_found; + u64 expected_pending_start; + u64 expected_pending_end; +}; + +static const struct first_pending_test_case first_pending_tests[] = { + { + .name = "no pending extent", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .pending_extent = { 0, 0 }, + .expected_found = false, + }, + { + .name = "pending extent at search start", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .pending_extent = { SZ_1G, SZ_1G }, + .expected_found = true, + .expected_pending_start = SZ_1G, + .expected_pending_end = SZ_2G - 1, + }, + { + .name = "pending extent overlapping search start", + .hole_start = SZ_1G, + .hole_len = 9ULL * SZ_1G, + .pending_extent = { 0, SZ_2G }, + .expected_found = true, + .expected_pending_start = 0, + .expected_pending_end = SZ_2G - 1, + }, + { + .name = "pending extent inside search range", + .hole_start = 0, + .hole_len = 10ULL * SZ_1G, + .pending_extent = { SZ_2G, SZ_1G }, + .expected_found = true, + .expected_pending_start = SZ_2G, + .expected_pending_end = 3ULL * SZ_1G - 1, + }, + { + .name = "pending extent outside search range", + .hole_start = 0, + .hole_len = SZ_1G, + .pending_extent = { SZ_2G, SZ_1G }, + .expected_found = false, + }, + { + .name = "pending extent overlapping end of search range", + .hole_start = 0, + .hole_len = SZ_2G, + .pending_extent = { SZ_1G, SZ_2G }, + .expected_found = true, + .expected_pending_start = SZ_1G, + .expected_pending_end = 3ULL * SZ_1G - 1, + }, +}; + +static int test_first_pending_extent(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_device *device; + int ret = 0; + + test_msg("running first_pending_extent tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + device = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(device)) { + test_err("failed to allocate dummy device"); + ret = PTR_ERR(device); + goto out_free_fs_info; + } + + device->fs_info = 
fs_info; + + for (int i = 0; i < ARRAY_SIZE(first_pending_tests); i++) { + const struct first_pending_test_case *test_case = &first_pending_tests[i]; + u64 start = test_case->pending_extent.start; + u64 len = test_case->pending_extent.len; + u64 pending_start, pending_end; + bool found; + + if (len) { + btrfs_set_extent_bit(&device->alloc_state, + start, start + len - 1, + CHUNK_ALLOCATED, NULL); + } + + mutex_lock(&fs_info->chunk_mutex); + found = btrfs_first_pending_extent(device, test_case->hole_start, + test_case->hole_len, + &pending_start, &pending_end); + mutex_unlock(&fs_info->chunk_mutex); + + if (found != test_case->expected_found) { + test_err("%s: expected found=%d, got found=%d", + test_case->name, test_case->expected_found, found); + ret = -EINVAL; + goto out_clear_pending_extents; + } + if (!found) + goto out_clear_pending_extents; + + if (pending_start != test_case->expected_pending_start || + pending_end != test_case->expected_pending_end) { + test_err("%s: expected pending [%llu, %llu], got [%llu, %llu]", + test_case->name, + test_case->expected_pending_start, + test_case->expected_pending_end, + pending_start, pending_end); + ret = -EINVAL; + goto out_clear_pending_extents; + } + +out_clear_pending_extents: + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, + CHUNK_ALLOCATED, NULL); + if (ret) + break; + } + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + +int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize) +{ + int ret; + + test_msg("running chunk allocation tests"); + + ret = test_first_pending_extent(sectorsize, nodesize); + if (ret) + return ret; + + ret = test_find_hole_in_pending(sectorsize, nodesize); + if (ret) + return ret; + + return 0; +} diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 329a922893b4..f281d113519b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1526,8 +1526,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, * may still be modified, to 
something outside the range and should not * be used. */ -static bool first_pending_extent(struct btrfs_device *device, u64 start, u64 len, - u64 *pending_start, u64 *pending_end) +bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, + u64 *pending_start, u64 *pending_end) { lockdep_assert_held(&device->fs_info->chunk_mutex); @@ -1566,8 +1566,8 @@ static bool first_pending_extent(struct btrfs_device *device, u64 start, u64 len * If there are no holes at all, then *start is set to the end of the range and * *len is set to 0. */ -static bool find_hole_in_pending_extents(struct btrfs_device *device, u64 *start, - u64 *len, u64 min_hole_size) +bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, u64 *start, + u64 *len, u64 min_hole_size) { u64 pending_start, pending_end; u64 end; @@ -1588,7 +1588,7 @@ static bool find_hole_in_pending_extents(struct btrfs_device *device, u64 *start * At the end of the iteration, set the output variables to the max hole. */ while (true) { - if (first_pending_extent(device, *start, *len, &pending_start, &pending_end)) { + if (btrfs_first_pending_extent(device, *start, *len, &pending_start, &pending_end)) { /* * Case 1: the pending extent overlaps the start of * candidate hole. That means the true hole is after the @@ -1758,7 +1758,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, again: *hole_size = hole_end - *hole_start + 1; - found = find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes); + found = btrfs_find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes); if (!found) return found; ASSERT(*hole_size >= num_bytes); @@ -5190,7 +5190,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) * in-memory chunks are synced to disk so that the loop below sees them * and relocates them accordingly. 
*/ - if (first_pending_extent(device, start, diff, &pending_start, &pending_end)) { + if (btrfs_first_pending_extent(device, start, diff, &pending_start, &pending_end)) { mutex_unlock(&fs_info->chunk_mutex); ret = btrfs_commit_transaction(trans); if (ret) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e4644352314a..ebc85bf53ee7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -30,6 +30,7 @@ struct btrfs_block_group; struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_zoned_device_info; +struct btrfs_space_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -892,6 +893,11 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits); +bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, + u64 *pending_start, u64 *pending_end); +bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, + u64 *start, u64 *len, u64 min_hole_size); + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, u64 logical, u16 total_stripes); From a56a70f8d20752c8be032ac03f7e8684dccdeb58 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 30 Jan 2026 17:06:45 +0000 Subject: [PATCH 125/137] btrfs: raid56: fix memory leak of btrfs_raid_bio::stripe_uptodate_bitmap We allocate the bitmap but we never free it in free_raid_bio_pointers(). Fix this by adding a bitmap_free() call against the stripe_uptodate_bitmap of a raid bio. 
Fixes: 1810350b04ef ("btrfs: raid56: move sector_ptr::uptodate into a dedicated bitmap") Reported-by: Christoph Hellwig Link: https://lore.kernel.org/linux-btrfs/20260126045315.GA31641@lst.de/ Reviewed-by: Qu Wenruo Tested-by: Christoph Hellwig Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f38d8305e46d..baadaaa189c0 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -150,6 +150,7 @@ static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); + bitmap_free(rbio->stripe_uptodate_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_paddrs); kfree(rbio->stripe_paddrs); From 392431cf9c2140b4356597f506f3e1e324e2ccfc Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 26 Jan 2026 14:49:50 +0900 Subject: [PATCH 126/137] btrfs: tests: add cleanup functions for test specific functions Add auto-cleanup helper functions for btrfs_free_dummy_fs_info and btrfs_free_dummy_block_group. Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b0e4b98bdc3d..b03d85a6e5ef 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -7,8 +7,10 @@ #define BTRFS_TESTS_H #include +#include #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) @@ -49,10 +51,14 @@ int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); +DEFINE_FREE(btrfs_free_dummy_fs_info, struct btrfs_fs_info *, + btrfs_free_dummy_fs_info(_T)) void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); +DEFINE_FREE(btrfs_free_dummy_block_group, struct btrfs_block_group *, + btrfs_free_dummy_block_group(_T)); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); From e564cd2511750a634f916ae406d1f6ff84e53d0d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 26 Jan 2026 14:49:51 +0900 Subject: [PATCH 127/137] btrfs: add cleanup function for btrfs_free_chunk_map Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc85bf53ee7..8288d79372a5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -634,6 +634,7 @@ static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) kfree(map); } } +DEFINE_FREE(btrfs_free_chunk_map, struct btrfs_chunk_map *, btrfs_free_chunk_map(_T)) struct btrfs_balance_control { struct btrfs_balance_args data; From e8f6130419d7a8b1384135a9e23d008c3fc01dad Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 26 Jan 2026 14:49:52 +0900 Subject: [PATCH 128/137] btrfs: zoned: factor out the zone loading part into a testable function Separate btrfs_load_block_group_* calling path into a function, so that it can be 
an entry point of unit test. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 105 +++++++++++++++++++++++++---------------------- fs/btrfs/zoned.h | 9 ++++ 2 files changed, 66 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b792136e3d08..ad8621587fd2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1823,6 +1823,62 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return 0; } +EXPORT_FOR_TESTS +int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active, u64 last_alloc) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + u64 profile; + int ret; + + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + switch (profile) { + case 0: /* single */ + ret = btrfs_load_block_group_single(bg, &zone_info[0], active); + break; + case BTRFS_BLOCK_GROUP_DUP: + ret = btrfs_load_block_group_dup(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(bg, map, zone_info, active, last_alloc); + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && + profile != BTRFS_BLOCK_GROUP_RAID10) { + /* + * Detected broken write pointer. Make this block group + * unallocatable by setting the allocation pointer at the end of + * allocatable region. 
Relocating this block group will fix the + * mismatch. + * + * Currently, we cannot handle RAID0 or RAID10 case like this + * because we don't have a proper zone_capacity value. But, + * reading from this block group won't work anyway by a missing + * stripe. + */ + bg->alloc_offset = bg->zone_capacity; + } + + return ret; +} + int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -1835,7 +1891,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; - u64 profile; if (!btrfs_is_zoned(fs_info)) return 0; @@ -1895,53 +1950,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } - profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; - switch (profile) { - case 0: /* single */ - ret = btrfs_load_block_group_single(cache, &zone_info[0], active); - break; - case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active, - last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID1: - case BTRFS_BLOCK_GROUP_RAID1C3: - case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, - active, last_alloc); - break; - case BTRFS_BLOCK_GROUP_RAID5: - case BTRFS_BLOCK_GROUP_RAID6: - default: - btrfs_err(fs_info, "zoned: profile %s not yet supported", - btrfs_bg_type_to_raid_name(map->type)); - ret = -EINVAL; - goto out; - } - - if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && - profile != BTRFS_BLOCK_GROUP_RAID10) { - /* - * Detected broken write pointer. 
Make this block group - * unallocatable by setting the allocation pointer at the end of - * allocatable region. Relocating this block group will fix the - * mismatch. - * - * Currently, we cannot handle RAID0 or RAID10 case like this - * because we don't have a proper zone_capacity value. But, - * reading from this block group won't work anyway by a missing - * stripe. - */ - cache->alloc_offset = cache->zone_capacity; - } + ret = btrfs_load_block_group_by_raid_type(cache, map, zone_info, active, last_alloc); out: /* Reject non SINGLE data profiles without RST */ diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 2fdc88c6fa3c..8e21a836f858 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -99,6 +99,15 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct zone_info; + +int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, + struct btrfs_chunk_map *map, + struct zone_info *zone_info, + unsigned long *active, u64 last_alloc); +#endif + #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) From 3be8a788eed3f7f30f32d69f50d648ba2c458f21 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:38 +1030 Subject: [PATCH 129/137] btrfs: lzo: introduce lzo_compress_bio() helper The new helper has the following enhancements against the existing lzo_compress_folios() - Much smaller parameter list No more shared IN/OUT members, no need to pre-allocate a compressed_folios[] array. Just a workspace list header and a compressed_bio pointer. Everything else can be fetched from that @cb pointer. 
- Read-to-be-submitted compressed bio Although the caller still needs to do some common works like rounding up and zeroing the tailing part of the last fs block. Some workloads are specific to lZO that is not needed with other multi-run compression interfaces: - Need to write a LZO header or segment header Use the new write_and_queue_folio() helper to do the bio_add_folio() call and folio switching. - Need to update the LZO header after compression is done Use bio_first_folio_all() to grab the first folio and update the header. - Extra corner case of error handling This can happen when we have queued part of a folio and hit an error. In that case those folios will be released by the bio. Thus we can only release the folio that has no queued part. Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 1 + fs/btrfs/lzo.c | 258 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index e0228017e861..4b63d7e4a9ad 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -161,6 +161,7 @@ struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); +int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index bd5ee82080fa..96e408add9b8 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -214,6 +214,157 @@ static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, return 0; } +/* + * Write data into @out_folio and queue it into 
@out_bio. + * + * Return 0 if everything is fine and @total_out will be increased. + * Return <0 for error. + * + * The @out_folio can be NULL after a full folio is queued. + * Thus the caller should check and allocate a new folio when needed. + */ +static int write_and_queue_folio(struct bio *out_bio, struct folio **out_folio, + u32 *total_out, u32 write_len) +{ + const u32 fsize = folio_size(*out_folio); + const u32 foffset = offset_in_folio(*out_folio, *total_out); + + ASSERT(out_folio && *out_folio); + /* Should not cross folio boundary. */ + ASSERT(foffset + write_len <= fsize); + + /* We can not use bio_add_folio_nofail() which doesn't do any merge. */ + if (!bio_add_folio(out_bio, *out_folio, write_len, foffset)) { + /* + * We have allocated a bio that has BTRFS_MAX_COMPRESSED_PAGES + * vecs, and all ranges inside the same folio should have been + * merged. If bio_add_folio() still failed, that means we have + * reached the bvec limits. + * + * This should only happen at the beginning of a folio, and + * caller is responsible for releasing the folio, since it's + * not yet queued into the bio. + */ + ASSERT(IS_ALIGNED(*total_out, fsize)); + return -E2BIG; + } + + *total_out += write_len; + /* + * The full folio has been filled and queued, reset @out_folio to NULL, + * so that error handling is fully handled by the bio. + */ + if (IS_ALIGNED(*total_out, fsize)) + *out_folio = NULL; + return 0; +} + +/* + * Copy compressed data to bio. + * + * @out_bio: The bio that will contain all the compressed data. + * @compressed_data: The compressed data of this segment. + * @compressed_size: The size of the compressed data. + * @out_folio: The current output folio, will be updated if a new + * folio is allocated. + * @total_out: The total bytes of current output. + * @max_out: The maximum size of the compressed data. 
+ * + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * - If a full folio is filled, it will be queued into @out_bio, and @out_folio + * will be updated. + * + * Will allocate new pages when needed. + */ +static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, + struct bio *out_bio, + const char *compressed_data, + size_t compressed_size, + struct folio **out_folio, + u32 *total_out, u32 max_out) +{ + const u32 sectorsize = fs_info->sectorsize; + const u32 sectorsize_bits = fs_info->sectorsize_bits; + const u32 fsize = btrfs_min_folio_size(fs_info); + const u32 old_size = out_bio->bi_iter.bi_size; + u32 copy_start; + u32 sector_bytes_left; + char *kaddr; + int ret; + + ASSERT(out_folio); + + /* There should be at least a lzo header queued. */ + ASSERT(old_size); + ASSERT(old_size == *total_out); + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. + */ + ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); + + if (!*out_folio) { + *out_folio = btrfs_alloc_compr_folio(fs_info); + if (!*out_folio) + return -ENOMEM; + } + + /* Write the segment header first. */ + kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); + write_compress_length(kaddr, compressed_size); + kunmap_local(kaddr); + ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); + if (ret < 0) + return ret; + + copy_start = *total_out; + + /* Copy compressed data. 
*/ + while (*total_out - copy_start < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *total_out % sectorsize, + copy_start + compressed_size - *total_out); + u32 foffset = *total_out & (fsize - 1); + + /* With the range copied, we're larger than the original range. */ + if (((*total_out + copy_len) >> sectorsize_bits) >= + max_out >> sectorsize_bits) + return -E2BIG; + + if (!*out_folio) { + *out_folio = btrfs_alloc_compr_folio(fs_info); + if (!*out_folio) + return -ENOMEM; + } + + kaddr = kmap_local_folio(*out_folio, foffset); + memcpy(kaddr, compressed_data + *total_out - copy_start, copy_len); + kunmap_local(kaddr); + ret = write_and_queue_folio(out_bio, out_folio, total_out, copy_len); + if (ret < 0) + return ret; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*total_out, sectorsize) - *total_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) + return 0; + + ASSERT(*out_folio); + + /* The remaining size is not enough, pad it with zeros */ + folio_zero_range(*out_folio, offset_in_folio(*out_folio, *total_out), sector_bytes_left); + return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left); +} + int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -310,6 +461,113 @@ int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, return ret; } +int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct btrfs_inode *inode = cb->bbio.inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct bio *bio = &cb->bbio.bio; + const u64 start = cb->start; + const u32 len = cb->len; + const u32 sectorsize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + 
struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio_in = NULL; + struct folio *folio_out = NULL; + char *sizes_ptr; + int ret = 0; + /* Points to the file offset of input data. */ + u64 cur_in = start; + /* Points to the current output byte. */ + u32 total_out = 0; + + ASSERT(bio->bi_iter.bi_size == 0); + ASSERT(len); + + folio_out = btrfs_alloc_compr_folio(fs_info); + if (!folio_out) + return -ENOMEM; + + /* Queue a segment header first. */ + ret = write_and_queue_folio(bio, &folio_out, &total_out, LZO_LEN); + /* The first header should not fail. */ + ASSERT(ret == 0); + + while (cur_in < start + len) { + char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first. */ + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); + if (ret < 0) + goto out; + } + + /* Compress at most one sector of data each time. */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); + data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, + workspace->mem); + kunmap_local(data_in); + if (unlikely(ret < 0)) { + /* lzo1x_1_compress never fails. */ + ret = -EIO; + goto out; + } + + ret = copy_compressed_data_to_bio(fs_info, bio, workspace->cbuf, out_len, + &folio_out, &total_out, len); + if (ret < 0) + goto out; + + cur_in += in_len; + + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. + */ + if (cur_in - start > sectorsize * 2 && cur_in - start < total_out) { + ret = -E2BIG; + goto out; + } + + /* Check if we have reached input folio boundary. */ + if (IS_ALIGNED(cur_in, min_folio_size)) { + folio_put(folio_in); + folio_in = NULL; + } + } + /* + * The last folio is already queued. Bio is responsible for freeing + * those folios now. 
+ */ + folio_out = NULL; + + /* Store the size of all chunks of compressed data */ + sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); + write_compress_length(sizes_ptr, total_out); + kunmap_local(sizes_ptr); +out: + /* + * We can only free the folio that has no part queued into the bio. + * + * As any folio that is already queued into bio will be released by + * the endio function of bio. + */ + if (folio_out && IS_ALIGNED(total_out, min_folio_size)) { + btrfs_free_compr_folio(folio_out); + folio_out = NULL; + } + if (folio_in) + folio_put(folio_in); + return ret; +} + static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi, u32 *cur_folio_index, u32 cur_in) { From bba959655ac5665f3ad2fc244c98da48d2ae4c17 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:39 +1030 Subject: [PATCH 130/137] btrfs: zstd: introduce zstd_compress_bio() helper The new helper has the following enhancements against the existing zstd_compress_folios() - Much smaller parameter list No more shared IN/OUT members, no need to pre-allocate a compressed_folios[] array. Just a workspace and compressed_bio pointer, everything we need can be extracted from that @cb pointer. - Ready-to-be-submitted compressed bio Although the caller still needs to do some common works like rounding up and zeroing the tailing part of the last fs block. Overall the workflow is the same as zstd_compress_folios(), but with some minor changes: - @start/@len is now constant For the current input file offset, use @start + @tot_in instead. The original change of @start and @len makes it pretty hard to know what value we're really comparing to. - No more @cur_len It's only utilized when switching input buffer. Directly use btrfs_calc_input_length() instead. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 1 + fs/btrfs/zstd.c | 186 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4b63d7e4a9ad..454c8e0461b4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -172,6 +172,7 @@ void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); +int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 7fad1e299c7a..135b0b32579f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -585,6 +585,192 @@ int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, return ret; } +int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct btrfs_inode *inode = cb->bbio.inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct bio *bio = &cb->bbio.bio; + zstd_cstream *stream; + int ret = 0; + /* The current folio to read. */ + struct folio *in_folio = NULL; + /* The current folio to write to. 
*/ + struct folio *out_folio = NULL; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + const u64 start = cb->start; + const u32 len = cb->len; + const u64 end = start + len; + const u32 blocksize = fs_info->sectorsize; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); + + /* Initialize the stream. */ + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); + if (unlikely(!stream)) { + btrfs_err(fs_info, + "zstd compression init level %d failed, root %llu inode %llu offset %llu", + workspace->req_level, btrfs_root_id(inode->root), + btrfs_ino(inode), start); + ret = -EIO; + goto out; + } + + /* Map in the first page of input data. */ + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); + workspace->in_buf.pos = 0; + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); + + /* Allocate and map in the output buffer. */ + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + workspace->out_buf.dst = folio_address(out_folio); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_folio_size; + + while (1) { + size_t ret2; + + ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); + if (unlikely(zstd_is_error(ret2))) { + btrfs_warn(fs_info, +"zstd compression level %d failed, error %d root %llu inode %llu offset %llu", + workspace->req_level, zstd_get_error_code(ret2), + btrfs_root_id(inode->root), btrfs_ino(inode), + start + tot_in); + ret = -EIO; + goto out; + } + + /* Check to see if we are making it bigger. */ + if (tot_in + workspace->in_buf.pos > blocksize * 2 && + tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space. 
*/ + if (workspace->out_buf.pos >= workspace->out_buf.size) { + tot_out += min_folio_size; + if (tot_out >= len) { + ret = -E2BIG; + goto out; + } + /* Queue the current folio into the bio. */ + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { + ret = -E2BIG; + goto out; + } + + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + workspace->out_buf.dst = folio_address(out_folio); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_folio_size; + } + + /* We've reached the end of the input. */ + if (tot_in + workspace->in_buf.pos >= len) { + tot_in += workspace->in_buf.pos; + break; + } + + /* Check if we need more input. */ + if (workspace->in_buf.pos >= workspace->in_buf.size) { + u64 cur; + + tot_in += workspace->in_buf.size; + cur = start + tot_in; + + kunmap_local(workspace->in_buf.src); + workspace->in_buf.src = NULL; + folio_put(in_folio); + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, + offset_in_folio(in_folio, cur)); + workspace->in_buf.pos = 0; + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, cur); + } + } + + while (1) { + size_t ret2; + + ret2 = zstd_end_stream(stream, &workspace->out_buf); + if (unlikely(zstd_is_error(ret2))) { + btrfs_err(fs_info, +"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", + workspace->req_level, zstd_get_error_code(ret2), + btrfs_root_id(inode->root), btrfs_ino(inode), + start + tot_in); + ret = -EIO; + goto out; + } + /* Queue the remaining part of the output folio into bio. 
*/ + if (ret2 == 0) { + tot_out += workspace->out_buf.pos; + if (tot_out >= len) { + ret = -E2BIG; + goto out; + } + if (!bio_add_folio(bio, out_folio, workspace->out_buf.pos, 0)) { + ret = -E2BIG; + goto out; + } + out_folio = NULL; + break; + } + tot_out += min_folio_size; + if (tot_out >= len) { + ret = -E2BIG; + goto out; + } + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { + ret = -E2BIG; + goto out; + } + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + workspace->out_buf.dst = folio_address(out_folio); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_folio_size; + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + ASSERT(tot_out == bio->bi_iter.bi_size); +out: + if (out_folio) + btrfs_free_compr_folio(out_folio); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); + folio_put(in_folio); + } + return ret; +} + int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); From 3d74a7556fbab89a3e78f514cf39d3413b9963d1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:40 +1030 Subject: [PATCH 131/137] btrfs: zlib: introduce zlib_compress_bio() helper The new helper has the following enhancements against the existing zlib_compress_folios() - Much smaller parameter list No more shared IN/OUT members, no need to pre-allocate a compressed_folios[] array. Just a workspace and compressed_bio pointer, everything we need can be extracted from that @cb pointer. - Ready-to-be-submitted compressed bio Although the caller still needs to do some common works like rounding up and zeroing the tailing part of the last fs block. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 1 + fs/btrfs/zlib.c | 194 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 454c8e0461b4..eee4190efa02 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -150,6 +150,7 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); +int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index a004aa4ee9e2..fa463e0e8527 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -334,6 +334,200 @@ int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, return ret; } +int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct btrfs_inode *inode = cb->bbio.inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct bio *bio = &cb->bbio.bio; + u64 start = cb->start; + u32 len = cb->len; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + int ret; + char *data_in = NULL; + char *cfolio_out; + struct folio *in_folio = NULL; + struct folio *out_folio = NULL; + const u32 blocksize = fs_info->sectorsize; + const u64 orig_end = start + len; + + ret = zlib_deflateInit(&workspace->strm, workspace->level); + if (unlikely(ret != Z_OK)) { + btrfs_err(fs_info, + "zlib compression init 
failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); + ret = -EIO; + goto out; + } + + workspace->strm.total_in = 0; + workspace->strm.total_out = 0; + + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + cfolio_out = folio_address(out_folio); + + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; + workspace->strm.next_out = cfolio_out; + workspace->strm.avail_out = min_folio_size; + + while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to the workspace + * buffer if required. + */ + if (workspace->strm.avail_in == 0) { + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * For s390 hardware accelerated zlib, and our folio is smaller + * than the copy_length, we need to fill the buffer so that + * we can take full advantage of hardware acceleration. 
+ */ + if (need_special_buffer(fs_info)) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = copy_length; + } else { + unsigned int cur_len; + + if (data_in) { + kunmap_local(data_in); + folio_put(in_folio); + data_in = NULL; + } + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + data_in = kmap_local_folio(in_folio, + offset_in_folio(in_folio, start)); + start += cur_len; + workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; + } + } + + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); + if (unlikely(ret != Z_OK)) { + btrfs_warn(fs_info, + "zlib compression failed, error %d root %llu inode %llu offset %llu", + ret, btrfs_root_id(inode->root), btrfs_ino(inode), + start); + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } + + /* We're making it bigger, give up. */ + if (workspace->strm.total_in > blocksize * 2 && + workspace->strm.total_in < workspace->strm.total_out) { + ret = -E2BIG; + goto out; + } + if (workspace->strm.total_out >= len) { + ret = -E2BIG; + goto out; + } + /* Queue the full folio and allocate a new one. */ + if (workspace->strm.avail_out == 0) { + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { + ret = -E2BIG; + goto out; + } + + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + cfolio_out = folio_address(out_folio); + workspace->strm.avail_out = min_folio_size; + workspace->strm.next_out = cfolio_out; + } + /* We're all done. */ + if (workspace->strm.total_in >= len) + break; + } + + workspace->strm.avail_in = 0; + + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. 
+ */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + if (workspace->strm.total_out >= len) { + ret = -E2BIG; + goto out; + } + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { + ret = -E2BIG; + goto out; + } + /* Get another folio for the stream end. */ + out_folio = btrfs_alloc_compr_folio(fs_info); + if (out_folio == NULL) { + ret = -ENOMEM; + goto out; + } + cfolio_out = folio_address(out_folio); + workspace->strm.avail_out = min_folio_size; + workspace->strm.next_out = cfolio_out; + } + } + /* Queue the remaining part of the folio. */ + if (workspace->strm.total_out > bio->bi_iter.bi_size) { + u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); + + if (!bio_add_folio(bio, out_folio, cur_len, 0)) { + ret = -E2BIG; + goto out; + } + } else { + /* The last folio hasn't been utilized. */ + btrfs_free_compr_folio(out_folio); + } + out_folio = NULL; + ASSERT(bio->bi_iter.bi_size == workspace->strm.total_out); + zlib_deflateEnd(&workspace->strm); + + if (workspace->strm.total_out >= workspace->strm.total_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; +out: + if (out_folio) + btrfs_free_compr_folio(out_folio); + if (data_in) { + kunmap_local(data_in); + folio_put(in_folio); + } + + return ret; +} + int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); From c51173271d528561706a2ce3bacd4f6232f4375b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:41 +1030 Subject: [PATCH 132/137] btrfs: introduce btrfs_compress_bio() helper The helper will allocate a new compressed_bio, do the compression, and return it to the caller. 
This greatly simplifies the compression path, as we no longer need to allocate a folio array thus no extra error path, furthermore the compressed bio structure can be utilized for submission with very minor modifications (like rounding up the bi_size and populate the bi_sector). Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 68 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/compression.h | 13 ++++++++ 2 files changed, 81 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4c6298cf01b2..6f123ae9a240 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1064,6 +1064,74 @@ int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inod return ret; } +/* + * Given an address space and start and length, compress the page cache + * contents into @cb. + * + * @type_level: is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * + * @cb->bbio.bio.bi_iter.bi_size will indicate the compressed data size. + * The bi_size may not be sectorsize aligned, thus the caller still need + * to do the round up before submission. + * + * This function will allocate compressed folios with btrfs_alloc_compr_folio(), + * thus callers must make sure the endio function and error handling are using + * btrfs_free_compr_folio() to release those folios. + * This is already done in end_bbio_compressed_write() and cleanup_compressed_bio(). 
+ */ +struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, + u64 start, u32 len, unsigned int type, + int level, blk_opf_t write_flags) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct list_head *workspace; + struct compressed_bio *cb; + int ret; + + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + end_bbio_compressed_write); + cb->start = start; + cb->len = len; + cb->writeback = true; + cb->compress_type = type; + + level = btrfs_compress_set_level(type, level); + workspace = get_workspace(fs_info, type, level); + switch (type) { + case BTRFS_COMPRESS_ZLIB: + ret = zlib_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_LZO: + ret = lzo_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_ZSTD: + ret = zstd_compress_bio(workspace, cb); + break; + case BTRFS_COMPRESS_NONE: + default: + /* + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. 
+ */ + ret = -E2BIG; + } + + put_workspace(fs_info, type, workspace); + if (ret < 0) { + cleanup_compressed_bio(cb); + return ERR_PTR(ret); + } + return cb; +} + static int btrfs_decompress_bio(struct compressed_bio *cb) { struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index eee4190efa02..fd0cce5d07cf 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -146,6 +146,19 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret); +struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, + u64 start, u32 len, unsigned int type, + int level, blk_opf_t write_flags); + +static inline void cleanup_compressed_bio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + btrfs_free_compr_folio(fi.folio); + bio_put(bio); +} int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, u64 start, struct folio **folios, unsigned long *out_folios, From 6f706f34fc4cc0ce180c293f9e793302fa00e4d8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:42 +1030 Subject: [PATCH 133/137] btrfs: switch to btrfs_compress_bio() interface for compressed writes This switch has the following benefits: - A single structure to handle all compression No more extra members like compressed_folios[] nor compress_type, all those members. This means the structure of async_extent is much smaller. - Simpler error handling A single cleanup_compressed_bio() will handle everything, no extra compressed_folios[] array to bother. Some extra notes: - Compressed folios releasing Now we go bio_for_each_folio_all() loop to release the folios of the bio. This will work for both the old compressed_folios[] array and the new pure bio method. 
For old compressed_folios[], all folios of that array is queued into the bio, thus releasing the folios from the bio is the same as releasing each folio of that array. We just need to be sure no double releasing from the array and bio. For the new pure bio method, that array is NULL, just usual folio releasing of the bio. The only extra note is for end_bbio_compressed_read(), as the folios are allocated using btrfs_alloc_folio_array(), thus the folios should only be released by regular folio_put(), not btrfs_free_compr_folio(). - Rounding up the bio to block size We cannot simply increase bi_size, as that will not increase the length of the last bvec. Thus we have to properly add the last part into the bio. This will be done by the helper, round_up_last_block(). The reason we do not round those bios up at compression time is to get the unaligned compressed size, so that they can be utilized for inline extents. If we round the bios up at *_compress_bio(), then every compressed bio will be larger than or equal to one fs block, resulting no inline compressed extent. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 16 ++-- fs/btrfs/inode.c | 185 ++++++++++++++++++++++------------------- 2 files changed, 107 insertions(+), 94 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6f123ae9a240..83d6103eb788 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -155,13 +155,6 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_folios(struct compressed_bio *cb) -{ - for (unsigned int i = 0; i < cb->nr_folios; i++) - btrfs_free_compr_folio(cb->compressed_folios[i]); - kfree(cb->compressed_folios); -} - static int btrfs_decompress_bio(struct compressed_bio *cb); /* @@ -270,12 +263,14 @@ static void end_bbio_compressed_read(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); blk_status_t status = bbio->bio.bi_status; + struct folio_iter fi; if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); + bio_for_each_folio_all(fi, &bbio->bio) + folio_put(fi.folio); bio_put(&bbio->bio); } @@ -326,6 +321,7 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); + struct folio_iter fi; btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, cb->bbio.bio.bi_status == BLK_STS_OK); @@ -333,7 +329,9 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) if (cb->writeback) end_compressed_writeback(cb); /* Note, our inode could be gone now. 
*/ - btrfs_free_compressed_folios(cb); + bio_for_each_folio_all(fi, &bbio->bio) + btrfs_free_compr_folio(fi.folio); + kfree(cb->compressed_folios); bio_put(&cb->bbio.bio); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 10609b8199a0..61b5eef7feb6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -755,10 +755,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, struct async_extent { u64 start; u64 ram_size; - u64 compressed_size; - struct folio **folios; - unsigned long nr_folios; - int compress_type; + struct compressed_bio *cb; struct list_head list; }; @@ -779,24 +776,18 @@ struct async_cow { struct async_chunk chunks[]; }; -static noinline int add_async_extent(struct async_chunk *cow, - u64 start, u64 ram_size, - u64 compressed_size, - struct folio **folios, - unsigned long nr_folios, - int compress_type) +static int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, + struct compressed_bio *cb) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); if (!async_extent) return -ENOMEM; + ASSERT(ram_size < U32_MAX); async_extent->start = start; async_extent->ram_size = ram_size; - async_extent->compressed_size = compressed_size; - async_extent->folios = folios; - async_extent->nr_folios = nr_folios; - async_extent->compress_type = compress_type; + async_extent->cb = cb; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -870,6 +861,61 @@ static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, return ret; } +static struct folio *compressed_bio_last_folio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct bio_vec *bvec; + phys_addr_t paddr; + + /* + * Make sure all folios have the same min_folio_size. + * + * Otherwise we cannot simply use offset_in_folio(folio, bi_size) to + * calculate the end of the last folio. 
+ */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); + const u32 min_folio_size = btrfs_min_folio_size(fs_info); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + ASSERT(folio_size(fi.folio) == min_folio_size); + } + + /* The bio must not be empty. */ + ASSERT(bio->bi_vcnt); + + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1; + return page_folio(phys_to_page(paddr)); +} + +static void zero_last_folio(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + struct folio *last_folio = compressed_bio_last_folio(cb); + const u32 bio_size = bio->bi_iter.bi_size; + const u32 foffset = offset_in_folio(last_folio, bio_size); + + folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset); +} + +static void round_up_last_block(struct compressed_bio *cb, u32 blocksize) +{ + struct bio *bio = &cb->bbio.bio; + struct folio *last_folio = compressed_bio_last_folio(cb); + const u32 bio_size = bio->bi_iter.bi_size; + const u32 foffset = offset_in_folio(last_folio, bio_size); + bool ret; + + if (IS_ALIGNED(bio_size, blocksize)) + return; + + ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset); + /* The remaining part should be merged thus never fail. */ + ASSERT(ret); +} + /* * Work queue call back to started compression on a file and pages. 
* @@ -890,20 +936,18 @@ static void compress_file_range(struct btrfs_work *work) struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; + struct compressed_bio *cb = NULL; const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; + u32 cur_len; int ret = 0; - struct folio **folios = NULL; - unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int loff; - int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -942,9 +986,10 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - folios = NULL; - nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); + total_in = 0; + cur_len = min(end + 1 - start, BTRFS_MAX_UNCOMPRESSED); + ret = 0; + cb = NULL; /* * we don't want to send crud past the end of i_size through @@ -959,10 +1004,6 @@ static void compress_file_range(struct btrfs_work *work) if (actual_end <= start) goto cleanup_and_bail_uncompressed; - total_compressed = min_t(unsigned long, actual_end - start, BTRFS_MAX_UNCOMPRESSED); - total_in = 0; - ret = 0; - /* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. 
This flag can change at any time if we @@ -971,15 +1012,6 @@ static void compress_file_range(struct btrfs_work *work) if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); - if (!folios) { - /* - * Memory allocation failure is not a fatal error, we can fall - * back to uncompressed code. - */ - goto cleanup_and_bail_uncompressed; - } - if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; compress_level = inode->defrag_compress_level; @@ -988,11 +1020,15 @@ static void compress_file_range(struct btrfs_work *work) } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type, compress_level, - inode, start, folios, &nr_folios, &total_in, - &total_compressed); - if (ret) + cb = btrfs_compress_bio(inode, start, cur_len, compress_type, + compress_level, async_chunk->write_flags); + if (IS_ERR(cb)) { + cb = NULL; goto mark_incompressible; + } + + total_compressed = cb->bbio.bio.bi_iter.bi_size; + total_in = cur_len; /* * Zero the tail end of the last folio, as we might be sending it down @@ -1000,7 +1036,7 @@ static void compress_file_range(struct btrfs_work *work) */ loff = (total_compressed & (min_folio_size - 1)); if (loff) - folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); + zero_last_folio(cb); /* * Try to create an inline extent. 
@@ -1016,11 +1052,13 @@ static void compress_file_range(struct btrfs_work *work) BTRFS_COMPRESS_NONE, NULL, false); else ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, - compress_type, folios[0], false); + compress_type, + bio_first_folio_all(&cb->bbio.bio), false); if (ret <= 0) { + cleanup_compressed_bio(cb); if (ret < 0) mapping_set_error(mapping, -EIO); - goto free_pages; + return; } /* @@ -1028,6 +1066,7 @@ static void compress_file_range(struct btrfs_work *work) * block size boundary so the allocator does sane things. */ total_compressed = ALIGN(total_compressed, blocksize); + round_up_last_block(cb, blocksize); /* * One last check to make sure the compression is really a win, compare @@ -1038,12 +1077,12 @@ static void compress_file_range(struct btrfs_work *work) if (total_compressed + blocksize > total_in) goto mark_incompressible; + /* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. 
*/ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, - nr_folios, compress_type); + ret = add_async_extent(async_chunk, start, total_in, cb); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1056,33 +1095,10 @@ static void compress_file_range(struct btrfs_work *work) if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: - ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); + ret = add_async_extent(async_chunk, start, end - start + 1, NULL); BUG_ON(ret); -free_pages: - if (folios) { - for (i = 0; i < nr_folios; i++) { - WARN_ON(folios[i]->mapping); - btrfs_free_compr_folio(folios[i]); - } - kfree(folios); - } -} - -static void free_async_extent_pages(struct async_extent *async_extent) -{ - int i; - - if (!async_extent->folios) - return; - - for (i = 0; i < async_extent->nr_folios; i++) { - WARN_ON(async_extent->folios[i]->mapping); - btrfs_free_compr_folio(async_extent->folios[i]); - } - kfree(async_extent->folios); - async_extent->nr_folios = 0; - async_extent->folios = NULL; + if (cb) + cleanup_compressed_bio(cb); } static void submit_uncompressed_range(struct btrfs_inode *inode, @@ -1129,7 +1145,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; - bool free_pages = false; + u32 compressed_size; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1149,17 +1165,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, locked_folio = async_chunk->locked_folio; } - if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - ASSERT(!async_extent->folios); - ASSERT(async_extent->nr_folios == 0); + if (!async_extent->cb) { submit_uncompressed_range(inode, async_extent, locked_folio); - free_pages = true; goto done; } + compressed_size = 
async_extent->cb->bbio.bio.bi_iter.bi_size; ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, + compressed_size, compressed_size, 0, *alloc_hint, &ins, true, true); if (ret) { /* @@ -1169,7 +1182,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); - free_pages = true; + cleanup_compressed_bio(async_extent->cb); + async_extent->cb = NULL; goto done; } @@ -1181,7 +1195,9 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, file_extent.ram_bytes = async_extent->ram_size; file_extent.num_bytes = async_extent->ram_size; file_extent.offset = 0; - file_extent.compression = async_extent->compress_type; + file_extent.compression = async_extent->cb->compress_type; + + async_extent->cb->bbio.bio.bi_iter.bi_sector = ins.objectid >> SECTOR_SHIFT; em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { @@ -1197,22 +1213,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(ordered); goto out_free_reserve; } + async_extent->cb->bbio.ordered = ordered; btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* Clear dirty, set writeback and unlock the pages. 
*/ extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - btrfs_submit_compressed_write(ordered, - async_extent->folios, /* compressed_folios */ - async_extent->nr_folios, - async_chunk->write_flags, true); + btrfs_submit_bbio(&async_extent->cb->bbio, 0); + async_extent->cb = NULL; + *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); - if (free_pages) - free_async_extent_pages(async_extent); kfree(async_extent); return; @@ -1227,7 +1241,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - free_async_extent_pages(async_extent); + if (async_extent->cb) + cleanup_compressed_bio(async_extent->cb); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); btrfs_debug(fs_info, From 26902be0cd0997b34ef13593e35ef3501a3c70b5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:43 +1030 Subject: [PATCH 134/137] btrfs: remove the old btrfs_compress_folios() infrastructure Since it's been replaced by btrfs_compress_bio(), remove all involved functions. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 70 --------------- fs/btrfs/compression.h | 12 --- fs/btrfs/inode.c | 2 +- fs/btrfs/lzo.c | 188 ---------------------------------------- fs/btrfs/zlib.c | 189 ----------------------------------------- fs/btrfs/zstd.c | 189 ----------------------------------------- 6 files changed, 1 insertion(+), 649 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 83d6103eb788..875e0d2bcb5d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -86,37 +86,6 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) return false; } -static int compression_compress_pages(int type, struct list_head *ws, - struct btrfs_inode *inode, u64 start, - struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - switch (type) { - case BTRFS_COMPRESS_ZLIB: - return zlib_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_LZO: - return lzo_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_ZSTD: - return zstd_compress_folios(ws, inode, start, folios, - out_folios, total_in, total_out); - case BTRFS_COMPRESS_NONE: - default: - /* - * This can happen when compression races with remount setting - * it to 'no compress', while caller doesn't call - * inode_need_compress() to check if we really need to - * compress. - * - * Not a big deal, just need to inform caller that we - * haven't allocated any pages yet. - */ - *out_folios = 0; - return -E2BIG; - } -} - static int compression_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { @@ -1023,45 +992,6 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, return 0; } -/* - * Given an address space and start and length, compress the bytes into @pages - * that are allocated on demand. 
- * - * @type_level is encoded algorithm and level, where level 0 means whatever - * default the algorithm chooses and is opaque here; - * - compression algo are 0-3 - * - the level are bits 4-7 - * - * @out_folios is an in/out parameter, holds maximum number of folios to allocate - * and returns number of actually allocated folios - * - * @total_in is used to return the number of bytes actually read. It - * may be smaller than the input length if we had to exit early because we - * ran out of room in the folios array or because we cross the - * max_out threshold. - * - * @total_out is an in/out parameter, must be set to the input length and will - * be also used to return the total number of compressed bytes - */ -int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const unsigned long orig_len = *total_out; - struct list_head *workspace; - int ret; - - level = btrfs_compress_set_level(type, level); - workspace = get_workspace(fs_info, type, level); - ret = compression_compress_pages(type, workspace, inode, start, folios, - out_folios, total_in, total_out); - /* The total read-in bytes should be no larger than the input. */ - ASSERT(*total_in <= orig_len); - put_workspace(fs_info, type, workspace); - return ret; -} - /* * Given an address space and start and length, compress the page cache * contents into @cb. 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index fd0cce5d07cf..7dc48e556313 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -91,9 +91,6 @@ int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); bool btrfs_compress_level_valid(unsigned int type, int level); -int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, @@ -160,9 +157,6 @@ static inline void cleanup_compressed_bio(struct compressed_bio *cb) bio_put(bio); } -int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, @@ -172,9 +166,6 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); -int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, @@ -183,9 +174,6 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); 
void lzo_free_workspace(struct list_head *ws); -int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out); int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b5eef7feb6..279e04892288 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -966,7 +966,7 @@ static void compress_file_range(struct btrfs_work *work) /* * All the folios should have been locked thus no failure. * - * And even if some folios are missing, btrfs_compress_folios() + * And even if some folios are missing, btrfs_compress_bio() * would handle them correctly, so here just do an ASSERT() check for * early logic errors. */ diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 96e408add9b8..8e20497afffe 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -122,98 +122,6 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } -/* - * Will do: - * - * - Write a segment header into the destination - * - Copy the compressed buffer into the destination - * - Make sure we have enough space in the last sector to fit a segment header - * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. - * - * Will allocate new pages when needed. 
- */ -static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, - char *compressed_data, - size_t compressed_size, - struct folio **out_folios, - unsigned long max_nr_folio, - u32 *cur_out) -{ - const u32 sectorsize = fs_info->sectorsize; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; - u32 sector_bytes_left; - u32 orig_out; - struct folio *cur_folio; - char *kaddr; - - if ((*cur_out >> min_folio_shift) >= max_nr_folio) - return -E2BIG; - - /* - * We never allow a segment header crossing sector boundary, previous - * run should ensure we have enough space left inside the sector. - */ - ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - - cur_folio = out_folios[*cur_out >> min_folio_shift]; - /* Allocate a new page */ - if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(fs_info); - if (!cur_folio) - return -ENOMEM; - out_folios[*cur_out >> min_folio_shift] = cur_folio; - } - - kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); - write_compress_length(kaddr, compressed_size); - *cur_out += LZO_LEN; - - orig_out = *cur_out; - - /* Copy compressed data */ - while (*cur_out - orig_out < compressed_size) { - u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, - orig_out + compressed_size - *cur_out); - - kunmap_local(kaddr); - - if ((*cur_out >> min_folio_shift) >= max_nr_folio) - return -E2BIG; - - cur_folio = out_folios[*cur_out >> min_folio_shift]; - /* Allocate a new page */ - if (!cur_folio) { - cur_folio = btrfs_alloc_compr_folio(fs_info); - if (!cur_folio) - return -ENOMEM; - out_folios[*cur_out >> min_folio_shift] = cur_folio; - } - kaddr = kmap_local_folio(cur_folio, 0); - - memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), - compressed_data + *cur_out - orig_out, copy_len); - - *cur_out += copy_len; - } - - /* - * Check if we can fit the next segment header into the remaining space - * of the sector. 
- */ - sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; - if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) - goto out; - - /* The remaining size is not enough, pad it with zeros */ - memset(kaddr + offset_in_page(*cur_out), 0, - sector_bytes_left); - *cur_out += sector_bytes_left; - -out: - kunmap_local(kaddr); - return 0; -} - /* * Write data into @out_folio and queue it into @out_bio. * @@ -365,102 +273,6 @@ static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left); } -int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct workspace *workspace = list_entry(ws, struct workspace, list); - const u32 sectorsize = fs_info->sectorsize; - const u32 min_folio_size = btrfs_min_folio_size(fs_info); - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct folio *folio_in = NULL; - char *sizes_ptr; - const unsigned long max_nr_folio = *out_folios; - int ret = 0; - /* Points to the file offset of input data */ - u64 cur_in = start; - /* Points to the current output byte */ - u32 cur_out = 0; - u32 len = *total_out; - - ASSERT(max_nr_folio > 0); - *out_folios = 0; - *total_out = 0; - *total_in = 0; - - /* - * Skip the header for now, we will later come back and write the total - * compressed size - */ - cur_out += LZO_LEN; - while (cur_in < start + len) { - char *data_in; - const u32 sectorsize_mask = sectorsize - 1; - u32 sector_off = (cur_in - start) & sectorsize_mask; - u32 in_len; - size_t out_len; - - /* Get the input page first */ - if (!folio_in) { - ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); - if (ret < 0) - goto out; - } - - /* Compress at most one sector of data each time */ - in_len = min_t(u32, start + len - 
cur_in, sectorsize - sector_off); - ASSERT(in_len); - data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); - ret = lzo1x_1_compress(data_in, in_len, - workspace->cbuf, &out_len, - workspace->mem); - kunmap_local(data_in); - if (unlikely(ret < 0)) { - /* lzo1x_1_compress never fails. */ - ret = -EIO; - goto out; - } - - ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, - folios, max_nr_folio, - &cur_out); - if (ret < 0) - goto out; - - cur_in += in_len; - - /* - * Check if we're making it bigger after two sectors. And if - * it is so, give up. - */ - if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { - ret = -E2BIG; - goto out; - } - - /* Check if we have reached folio boundary. */ - if (IS_ALIGNED(cur_in, min_folio_size)) { - folio_put(folio_in); - folio_in = NULL; - } - } - - /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_folio(folios[0], 0); - write_compress_length(sizes_ptr, cur_out); - kunmap_local(sizes_ptr); - - ret = 0; - *total_out = cur_out; - *total_in = cur_in - start; -out: - if (folio_in) - folio_put(folio_in); - *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); - return ret; -} - int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_inode *inode = cb->bbio.inode; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index fa463e0e8527..0a8fcee16428 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -145,195 +145,6 @@ static int copy_data_into_buffer(struct address_space *mapping, return 0; } -int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct workspace *workspace = list_entry(ws, struct workspace, list); - struct address_space *mapping = inode->vfs_inode.i_mapping; - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 
- const u32 min_folio_size = btrfs_min_folio_size(fs_info); - int ret; - char *data_in = NULL; - char *cfolio_out; - int nr_folios = 0; - struct folio *in_folio = NULL; - struct folio *out_folio = NULL; - unsigned long len = *total_out; - unsigned long nr_dest_folios = *out_folios; - const unsigned long max_out = nr_dest_folios << min_folio_shift; - const u32 blocksize = fs_info->sectorsize; - const u64 orig_end = start + len; - - *out_folios = 0; - *total_out = 0; - *total_in = 0; - - ret = zlib_deflateInit(&workspace->strm, workspace->level); - if (unlikely(ret != Z_OK)) { - btrfs_err(fs_info, - "zlib compression init failed, error %d root %llu inode %llu offset %llu", - ret, btrfs_root_id(inode->root), btrfs_ino(inode), start); - ret = -EIO; - goto out; - } - - workspace->strm.total_in = 0; - workspace->strm.total_out = 0; - - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - cfolio_out = folio_address(out_folio); - folios[0] = out_folio; - nr_folios = 1; - - workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = 0; - workspace->strm.next_out = cfolio_out; - workspace->strm.avail_out = min_folio_size; - - while (workspace->strm.total_in < len) { - /* - * Get next input pages and copy the contents to - * the workspace buffer if required. - */ - if (workspace->strm.avail_in == 0) { - unsigned long bytes_left = len - workspace->strm.total_in; - unsigned int copy_length = min(bytes_left, workspace->buf_size); - - /* - * For s390 hardware accelerated zlib, and our folio is smaller - * than the copy_length, we need to fill the buffer so that - * we can take full advantage of hardware acceleration. 
- */ - if (need_special_buffer(fs_info)) { - ret = copy_data_into_buffer(mapping, workspace, - start, copy_length); - if (ret < 0) - goto out; - start += copy_length; - workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = copy_length; - } else { - unsigned int cur_len; - - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); - data_in = kmap_local_folio(in_folio, - offset_in_folio(in_folio, start)); - start += cur_len; - workspace->strm.next_in = data_in; - workspace->strm.avail_in = cur_len; - } - } - - ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); - if (unlikely(ret != Z_OK)) { - btrfs_warn(fs_info, - "zlib compression failed, error %d root %llu inode %llu offset %llu", - ret, btrfs_root_id(inode->root), btrfs_ino(inode), - start); - zlib_deflateEnd(&workspace->strm); - ret = -EIO; - goto out; - } - - /* we're making it bigger, give up */ - if (workspace->strm.total_in > blocksize * 2 && - workspace->strm.total_in < - workspace->strm.total_out) { - ret = -E2BIG; - goto out; - } - /* we need another page for writing out. 
Test this - * before the total_in so we will pull in a new page for - * the stream end if required - */ - if (workspace->strm.avail_out == 0) { - if (nr_folios == nr_dest_folios) { - ret = -E2BIG; - goto out; - } - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - cfolio_out = folio_address(out_folio); - folios[nr_folios] = out_folio; - nr_folios++; - workspace->strm.avail_out = min_folio_size; - workspace->strm.next_out = cfolio_out; - } - /* we're all done */ - if (workspace->strm.total_in >= len) - break; - if (workspace->strm.total_out > max_out) - break; - } - workspace->strm.avail_in = 0; - /* - * Call deflate with Z_FINISH flush parameter providing more output - * space but no more input data, until it returns with Z_STREAM_END. - */ - while (ret != Z_STREAM_END) { - ret = zlib_deflate(&workspace->strm, Z_FINISH); - if (ret == Z_STREAM_END) - break; - if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) { - zlib_deflateEnd(&workspace->strm); - ret = -EIO; - goto out; - } else if (workspace->strm.avail_out == 0) { - /* Get another folio for the stream end. 
*/ - if (nr_folios == nr_dest_folios) { - ret = -E2BIG; - goto out; - } - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - cfolio_out = folio_address(out_folio); - folios[nr_folios] = out_folio; - nr_folios++; - workspace->strm.avail_out = min_folio_size; - workspace->strm.next_out = cfolio_out; - } - } - zlib_deflateEnd(&workspace->strm); - - if (workspace->strm.total_out >= workspace->strm.total_in) { - ret = -E2BIG; - goto out; - } - - ret = 0; - *total_out = workspace->strm.total_out; - *total_in = workspace->strm.total_in; -out: - *out_folios = nr_folios; - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - } - - return ret; -} - int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_inode *inode = cb->bbio.inode; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 135b0b32579f..32fd7f5454d3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -396,195 +396,6 @@ struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) return ERR_PTR(-ENOMEM); } -int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, - u64 start, struct folio **folios, unsigned long *out_folios, - unsigned long *total_in, unsigned long *total_out) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct workspace *workspace = list_entry(ws, struct workspace, list); - struct address_space *mapping = inode->vfs_inode.i_mapping; - zstd_cstream *stream; - int ret = 0; - int nr_folios = 0; - struct folio *in_folio = NULL; /* The current folio to read. */ - struct folio *out_folio = NULL; /* The current folio to write to. 
*/ - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long len = *total_out; - const unsigned long nr_dest_folios = *out_folios; - const u64 orig_end = start + len; - const u32 blocksize = fs_info->sectorsize; - const u32 min_folio_size = btrfs_min_folio_size(fs_info); - unsigned long max_out = nr_dest_folios * min_folio_size; - unsigned int cur_len; - - workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_folios = 0; - *total_out = 0; - *total_in = 0; - - /* Initialize the stream */ - stream = zstd_init_cstream(&workspace->params, len, workspace->mem, - workspace->size); - if (unlikely(!stream)) { - btrfs_err(fs_info, - "zstd compression init level %d failed, root %llu inode %llu offset %llu", - workspace->req_level, btrfs_root_id(inode->root), - btrfs_ino(inode), start); - ret = -EIO; - goto out; - } - - /* map in the first page of input data */ - ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); - if (ret < 0) - goto out; - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); - workspace->in_buf.pos = 0; - workspace->in_buf.size = cur_len; - - /* Allocate and map in the output buffer */ - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - folios[nr_folios++] = out_folio; - workspace->out_buf.dst = folio_address(out_folio); - workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); - - while (1) { - size_t ret2; - - ret2 = zstd_compress_stream(stream, &workspace->out_buf, - &workspace->in_buf); - if (unlikely(zstd_is_error(ret2))) { - btrfs_warn(fs_info, -"zstd compression level %d failed, error %d root %llu inode %llu offset %llu", - workspace->req_level, zstd_get_error_code(ret2), - btrfs_root_id(inode->root), btrfs_ino(inode), - start); - ret = -EIO; - goto out; - } - - /* Check to see if we are making it 
bigger */ - if (tot_in + workspace->in_buf.pos > blocksize * 2 && - tot_in + workspace->in_buf.pos < - tot_out + workspace->out_buf.pos) { - ret = -E2BIG; - goto out; - } - - /* We've reached the end of our output range */ - if (workspace->out_buf.pos >= max_out) { - tot_out += workspace->out_buf.pos; - ret = -E2BIG; - goto out; - } - - /* Check if we need more output space */ - if (workspace->out_buf.pos == workspace->out_buf.size) { - tot_out += min_folio_size; - max_out -= min_folio_size; - if (nr_folios == nr_dest_folios) { - ret = -E2BIG; - goto out; - } - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - folios[nr_folios++] = out_folio; - workspace->out_buf.dst = folio_address(out_folio); - workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); - } - - /* We've reached the end of the input */ - if (workspace->in_buf.pos >= len) { - tot_in += workspace->in_buf.pos; - break; - } - - /* Check if we need more input */ - if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += workspace->in_buf.size; - kunmap_local(workspace->in_buf.src); - workspace->in_buf.src = NULL; - folio_put(in_folio); - start += cur_len; - len -= cur_len; - ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); - if (ret < 0) - goto out; - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, - offset_in_folio(in_folio, start)); - workspace->in_buf.pos = 0; - workspace->in_buf.size = cur_len; - } - } - while (1) { - size_t ret2; - - ret2 = zstd_end_stream(stream, &workspace->out_buf); - if (unlikely(zstd_is_error(ret2))) { - btrfs_err(fs_info, -"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", - workspace->req_level, zstd_get_error_code(ret2), - btrfs_root_id(inode->root), btrfs_ino(inode), - start); - ret = -EIO; - goto out; - } - if (ret2 == 0) { - tot_out += 
workspace->out_buf.pos; - break; - } - if (workspace->out_buf.pos >= max_out) { - tot_out += workspace->out_buf.pos; - ret = -E2BIG; - goto out; - } - - tot_out += min_folio_size; - max_out -= min_folio_size; - if (nr_folios == nr_dest_folios) { - ret = -E2BIG; - goto out; - } - out_folio = btrfs_alloc_compr_folio(fs_info); - if (out_folio == NULL) { - ret = -ENOMEM; - goto out; - } - folios[nr_folios++] = out_folio; - workspace->out_buf.dst = folio_address(out_folio); - workspace->out_buf.pos = 0; - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); - } - - if (tot_out >= tot_in) { - ret = -E2BIG; - goto out; - } - - ret = 0; - *total_in = tot_in; - *total_out = tot_out; -out: - *out_folios = nr_folios; - if (workspace->in_buf.src) { - kunmap_local(workspace->in_buf.src); - folio_put(in_folio); - } - return ret; -} - int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) { struct btrfs_inode *inode = cb->bbio.inode; From dafcfa1c8e377a3d8e2e1d72a76435b57ed1ac7d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:44 +1030 Subject: [PATCH 135/137] btrfs: get rid of compressed_folios[] usage for compressed read Currently btrfs_submit_compressed_read() still uses compressed_bio::compressed_folios[] array. Change it to allocate each folio and queue them into the compressed bio so that we do not need to allocate that array. Considering how small each compressed read bio is (less than 128KiB), we do not benefit that much from btrfs_alloc_folio_array() anyway, while we may benefit more from btrfs_alloc_compr_folio() by using the global folio pool. So changing from btrfs_alloc_folio_array() to btrfs_alloc_compr_folio() in a loop should still be fine. This removes one error path, and paves the way to completely remove compressed_folios[] array. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 875e0d2bcb5d..8501a5e4132d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -239,7 +239,7 @@ static void end_bbio_compressed_read(struct btrfs_bio *bbio) btrfs_bio_end_io(cb->orig_bbio, status); bio_for_each_folio_all(fi, &bbio->bio) - folio_put(fi.folio); + btrfs_free_compr_folio(fi.folio); bio_put(&bbio->bio); } @@ -537,13 +537,13 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map_tree *em_tree = &inode->extent_tree; struct compressed_bio *cb; unsigned int compressed_len; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 file_offset = bbio->file_offset; u64 em_len; u64 em_start; struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t status; int ret; /* we need the actual starting offset of this extent in the file */ @@ -551,7 +551,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - status = BLK_STS_IOERR; + ret = -EIO; goto out; } @@ -573,27 +573,30 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) btrfs_free_extent_map(em); - cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); - if (!cb->compressed_folios) { - status = BLK_STS_RESOURCE; - goto out_free_bio; - } + for (int i = 0; i * min_folio_size < compressed_len; i++) { + struct folio *folio; + u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); - ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, - cb->compressed_folios); - if (ret) { - status = 
BLK_STS_RESOURCE; - goto out_free_compressed_pages; + folio = btrfs_alloc_compr_folio(fs_info); + if (!folio) { + ret = -ENOMEM; + goto out_free_bio; + } + + ret = bio_add_folio(&cb->bbio.bio, folio, cur_len, 0); + if (unlikely(!ret)) { + folio_put(folio); + ret = -EINVAL; + goto out_free_bio; + } } + ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, &pflags); - /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); @@ -601,12 +604,10 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) btrfs_submit_bbio(&cb->bbio, 0); return; -out_free_compressed_pages: - kfree(cb->compressed_folios); out_free_bio: - bio_put(&cb->bbio.bio); + cleanup_compressed_bio(cb); out: - btrfs_bio_end_io(bbio, status); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); } /* From e1bc83f8b157689e5de4f651b6fbb9dcdccd33c1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:45 +1030 Subject: [PATCH 136/137] btrfs: get rid of compressed_folios[] usage for encoded writes Currently only encoded writes utilize btrfs_submit_compressed_write(), which utilizes the compressed_bio::compressed_folios[] array. Change the only call site to call the new helper, btrfs_alloc_compressed_write(), to allocate a compressed bio, then queue needed folios into that bio, and finally call btrfs_submit_compressed_write() to submit the compressed bio. This change has one hidden benefit: previously we used btrfs_alloc_folio_array() for the folios of btrfs_submit_compressed_read(), which doesn't utilize the compression page pool for bs == ps cases. Now we call btrfs_alloc_compr_folio() which will benefit from the page pool. The other obvious benefit is that we no longer need to allocate an array to hold all those folios, thus one less error path. 
Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 52 ++++++++++++++-------------------- fs/btrfs/compression.h | 6 ++-- fs/btrfs/inode.c | 63 +++++++++++++++++++++++------------------- 3 files changed, 58 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8501a5e4132d..dcd7bc60107d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -304,25 +304,6 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) bio_put(&cb->bbio.bio); } -static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) -{ - struct bio *bio = &cb->bbio.bio; - u32 offset = 0; - unsigned int findex = 0; - - while (offset < cb->compressed_len) { - struct folio *folio = cb->compressed_folios[findex]; - u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); - int ret; - - /* Maximum compressed extent is smaller than bio size limit. */ - ret = bio_add_folio(bio, folio, len, 0); - ASSERT(ret); - offset += len; - findex++; - } -} - /* * worker function to build and submit bios for previously compressed pages. * The corresponding pages in the inode should be marked for writeback @@ -333,34 +314,43 @@ static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) * the end io hooks. 
*/ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct folio **compressed_folios, - unsigned int nr_folios, - blk_opf_t write_flags, - bool writeback) + struct compressed_bio *cb) { struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct compressed_bio *cb; ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); + ASSERT(cb->writeback); - cb = alloc_compressed_bio(inode, ordered->file_offset, - REQ_OP_WRITE | write_flags, - end_bbio_compressed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; - cb->writeback = writeback; - cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_folios(cb); btrfs_submit_bbio(&cb->bbio, 0); } +/* + * Allocate a compressed write bio for @inode file offset @start length @len. + * + * The caller still needs to properly queue all folios and populate involved + * members. + */ +struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, + u64 start, u64 len) +{ + struct compressed_bio *cb; + + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE, end_bbio_compressed_write); + cb->start = start; + cb->len = len; + cb->writeback = true; + + return cb; +} + /* * Add extra pages in the same compressed file extent so that we don't need to * re-read the same extent again and again. 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 7dc48e556313..2d3a28b26997 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -96,10 +96,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); +struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, + u64 start, u64 len); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct folio **compressed_folios, - unsigned int nr_folios, blk_opf_t write_flags, - bool writeback); + struct compressed_bio *cb); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 279e04892288..1aebd2ee2704 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9828,12 +9828,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; + struct compressed_bio *cb = NULL; int compression; size_t orig_count; + const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_folios, i; - struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -9922,39 +9922,46 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. 
*/ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); - if (!folios) - return -ENOMEM; - for (i = 0; i < nr_folios; i++) { - size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + + cb = btrfs_alloc_compressed_write(inode, start, num_bytes); + for (int i = 0; i * min_folio_size < disk_num_bytes; i++) { + struct folio *folio; + size_t bytes = min(min_folio_size, iov_iter_count(from)); char *kaddr; - folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); - if (!folios[i]) { + folio = btrfs_alloc_compr_folio(fs_info); + if (!folio) { ret = -ENOMEM; - goto out_folios; + goto out_cb; } - kaddr = kmap_local_folio(folios[i], 0); - if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap_local(kaddr); - ret = -EFAULT; - goto out_folios; - } - if (bytes < PAGE_SIZE) - memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kaddr = kmap_local_folio(folio, 0); + ret = copy_from_iter(kaddr, bytes, from); kunmap_local(kaddr); + if (ret != bytes) { + folio_put(folio); + ret = -EFAULT; + goto out_cb; + } + if (bytes < min_folio_size) + folio_zero_range(folio, bytes, min_folio_size - bytes); + ret = bio_add_folio(&cb->bbio.bio, folio, folio_size(folio), 0); + if (unlikely(!ret)) { + folio_put(folio); + ret = -EINVAL; + goto out_cb; + } } + ASSERT(cb->bbio.bio.bi_iter.bi_size == disk_num_bytes); for (;;) { ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) - goto out_folios; + goto out_cb; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_folios; + goto out_cb; btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -9986,7 +9993,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, 
encoded->len, orig_count)) { ret = __cow_file_range_inline(inode, encoded->len, - orig_count, compression, folios[0], + orig_count, compression, + bio_first_folio_all(&cb->bbio.bio), true); if (ret <= 0) { if (ret == 0) @@ -10031,7 +10039,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); + btrfs_submit_compressed_write(ordered, cb); ret = orig_count; goto out; @@ -10053,12 +10061,9 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: btrfs_unlock_extent(io_tree, start, end, &cached_state); -out_folios: - for (i = 0; i < nr_folios; i++) { - if (folios[i]) - folio_put(folios[i]); - } - kvfree(folios); +out_cb: + if (cb) + cleanup_compressed_bio(cb); out: if (ret >= 0) iocb->ki_pos += encoded->len; From 161ab30da6899f31f8128cec7c833e99fa4d06d2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Jan 2026 13:53:46 +1030 Subject: [PATCH 137/137] btrfs: get rid of compressed_bio::compressed_folios[] Now that there is no one utilizing that member, we can safely remove it along with the compressed_bio::nr_folios member. The size is reduced from 352 to 336 bytes on x86_64. Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 - fs/btrfs/compression.h | 6 ------ 2 files changed, 7 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index dcd7bc60107d..1e7174ad32e2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -300,7 +300,6 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) /* Note, our inode could be gone now. 
*/ bio_for_each_folio_all(fi, &bbio->bio) btrfs_free_compr_folio(fi.folio); - kfree(cb->compressed_folios); bio_put(&cb->bbio.bio); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2d3a28b26997..65b8bc4bbe0b 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -42,12 +42,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of compressed folios in the array. */ - unsigned int nr_folios; - - /* The folios with the compressed data on them. */ - struct folio **compressed_folios; - /* starting offset in the inode for our pages */ u64 start;