From ad0c23c97b4e9ed8233905294dd86659009c6e8a Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Thu, 5 Mar 2026 11:06:44 +0100
Subject: [PATCH] btrfs: zoned: limit number of zones reclaimed in
 flush_space()

Limit the number of zones reclaimed in flush_space()'s RECLAIM_ZONES
state.

This prevents possibly long running reclaim sweeps to block other tasks in
the system, while the system is under pressure anyways, causing the
tasks to hang.

An example of this can be seen here, triggered by fstests generic/551:

generic/551        [   27.042349] run fstests generic/551 at 2026-02-27 11:05:30
 BTRFS: device fsid 78c16e29-20d9-4c8e-bc04-7ba431be38ff devid 1 transid 8 /dev/vdb (254:16) scanned by mount (806)
 BTRFS info (device vdb): first mount of filesystem 78c16e29-20d9-4c8e-bc04-7ba431be38ff
 BTRFS info (device vdb): using crc32c checksum algorithm
 BTRFS info (device vdb): host-managed zoned block device /dev/vdb, 64 zones of 268435456 bytes
 BTRFS info (device vdb): zoned mode enabled with zone size 268435456
 BTRFS info (device vdb): checking UUID tree
 BTRFS info (device vdb): enabling free space tree
 INFO: task kworker/u38:1:90 blocked for more than 120 seconds.
       Not tainted 7.0.0-rc1+ #345
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 task:kworker/u38:1   state:D stack:0     pid:90    tgid:90    ppid:2      task_flags:0x4208060 flags:0x00080000
 Workqueue: events_unbound btrfs_async_reclaim_data_space
 Call Trace:
  <TASK>
  __schedule+0x34f/0xe70
  schedule+0x41/0x140
  schedule_timeout+0xa3/0x110
  ? mark_held_locks+0x40/0x70
  ? lockdep_hardirqs_on_prepare+0xd8/0x1c0
  ? trace_hardirqs_on+0x18/0x100
  ? lockdep_hardirqs_on+0x84/0x130
  ? _raw_spin_unlock_irq+0x33/0x50
  wait_for_completion+0xa4/0x150
  ? __flush_work+0x24c/0x550
  __flush_work+0x339/0x550
  ? __pfx_wq_barrier_func+0x10/0x10
  ? wait_for_completion+0x39/0x150
  flush_space+0x243/0x660
  ? find_held_lock+0x2b/0x80
  ? kvm_sched_clock_read+0x11/0x20
  ? local_clock_noinstr+0x17/0x110
  ? local_clock+0x15/0x30
  ? lock_release+0x1b7/0x4b0
  do_async_reclaim_data_space+0xe8/0x160
  btrfs_async_reclaim_data_space+0x19/0x30
  process_one_work+0x20a/0x5f0
  ? lock_is_held_type+0xcd/0x130
  worker_thread+0x1e2/0x3c0
  ? __pfx_worker_thread+0x10/0x10
  kthread+0x103/0x150
  ? __pfx_kthread+0x10/0x10
  ret_from_fork+0x20d/0x320
  ? __pfx_kthread+0x10/0x10
  ret_from_fork_asm+0x1a/0x30
  </TASK>

 Showing all locks held in the system:
 1 lock held by khungtaskd/67:
  #0: ffffffff824d58e0 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x3d/0x194
 2 locks held by kworker/u38:1/90:
  #0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
  #1: ffffc90000c17e58 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
 5 locks held by kworker/u39:1/191:
  #0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
  #1: ffffc90000dfbe58 ((work_completion)(&fs_info->reclaim_bgs_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
  #2: ffff888101da0420 (sb_writers#9){.+.+}-{0:0}, at: process_one_work+0x20a/0x5f0
  #3: ffff88811040a648 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: btrfs_reclaim_bgs_work+0x1de/0x770
  #4: ffff888110408a18 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x95a/0x20f0
 1 lock held by aio-dio-write-v/980:
  #0: ffff888110093008 (&sb->s_type->i_mutex_key#15){++++}-{4:4}, at: btrfs_inode_lock+0x51/0xb0

 =============================================

To prevent these long running reclaims from blocking the system, only
reclaim 5 block_groups in the RECLAIM_ZONES state of flush_space(). Also
as these reclaims are now constrained, it opens up the use for a
synchronous call to brtfs_reclaim_block_groups(), eliminating the need
to place the reclaim task on a workqueue and then flushing the workqueue
again.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 13 +++++++++----
 fs/btrfs/block-group.h |  1 +
 fs/btrfs/space-info.c  |  6 ++++--
 3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 72fc9b3b6dc0..fa6e49a4ba37 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1909,7 +1909,7 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b
 	return true;
 }
 
-static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
+static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;
 	struct btrfs_space_info *space_info = bg->space_info;
@@ -2036,15 +2036,18 @@ static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
 	if (space_info->total_bytes < old_total)
 		btrfs_set_periodic_reclaim_ready(space_info, true);
 	spin_unlock(&space_info->lock);
+	if (!ret)
+		(*reclaimed)++;
 
 	return ret;
 }
 
-static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
+void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit)
 {
 	struct btrfs_block_group *bg;
 	struct btrfs_space_info *space_info;
 	LIST_HEAD(retry_list);
+	int reclaimed = 0;
 
 	if (!btrfs_should_reclaim(fs_info))
 		return;
@@ -2080,7 +2083,7 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
 
 		space_info = bg->space_info;
 		spin_unlock(&fs_info->unused_bgs_lock);
-		ret = btrfs_reclaim_block_group(bg);
+		ret = btrfs_reclaim_block_group(bg, &reclaimed);
 
 		if (ret && !READ_ONCE(space_info->periodic_reclaim))
 			btrfs_link_bg_list(bg, &retry_list);
@@ -2099,6 +2102,8 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
 		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
 			goto end;
 		spin_lock(&fs_info->unused_bgs_lock);
+		if (reclaimed >= limit)
+			break;
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -2114,7 +2119,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 	struct btrfs_fs_info *fs_info =
 		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
 
-	btrfs_reclaim_block_groups(fs_info);
+	btrfs_reclaim_block_groups(fs_info, -1);
 }
 
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c03e04292900..0504cb357992 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -350,6 +350,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_chunk_map *map);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit);
 void btrfs_reclaim_bgs_work(struct work_struct *work);
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 0e5274c3b988..e017bb182c8c 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -212,6 +212,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 
 #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET			(10ULL)
 
+#define BTRFS_ZONED_SYNC_RECLAIM_BATCH				(5)
+
 /*
  * Calculate chunk size depending on volume type (regular or zoned).
  */
@@ -918,8 +920,8 @@ static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
 		if (btrfs_is_zoned(fs_info)) {
 			btrfs_reclaim_sweep(fs_info);
 			btrfs_delete_unused_bgs(fs_info);
-			btrfs_reclaim_bgs(fs_info);
-			flush_work(&fs_info->reclaim_bgs_work);
+			btrfs_reclaim_block_groups(fs_info,
+						   BTRFS_ZONED_SYNC_RECLAIM_BATCH);
 			ASSERT(current->journal_info == NULL);
 			ret = btrfs_commit_current_transaction(root);
 		} else {