From 7098ba57c443868cab250a3a5db72c89ffbd9026 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 01:56:10 -0400
Subject: [PATCH 01/69] bcachefs: fix REFLINK_P_MAY_UPDATE_OPTIONS

If we're doing a reflink copy of existing reflinked data, we may only
set REFLINK_P_MAY_UPDATE_OPTIONS if it was set on the reflink pointer
we're copying from.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/reflink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 3a13dbcab6ba..41ca86cba2cd 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -711,7 +711,8 @@ s64 bch2_remap_range(struct bch_fs *c,
 			SET_REFLINK_P_IDX(&dst_p->v, offset);
 
 			if (reflink_p_may_update_opts_field &&
-			    may_change_src_io_path_opts)
+			    may_change_src_io_path_opts &&
+			    REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
 				SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
 		} else {
 			BUG();

From 97e69f12edb19a17589ca0b6f3988b2a28af87c8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 14:20:58 -0400
Subject: [PATCH 02/69] bcachefs: Fix missing
 BTREE_UPDATE_internal_snapshot_node

Repair code will do updates on older snapshot versions, so needs the
correct annotation.

Reported-by: syzbot+42581416dba62b364750@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/namei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index a84b69d6caef..ff7ba297d3ef 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -857,7 +857,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans,
 			n->v.d_inum = cpu_to_le64(target->bi_inum);
 		}
 
-		ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
+		ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
+					BTREE_UPDATE_internal_snapshot_node);
 		if (ret)
 			goto err;
 	}

From ff875d4b474739662d7fefece7532ff77c8b3b70 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 14:37:20 -0400
Subject: [PATCH 03/69] bcachefs: Ensure we print output of run_recovery_pass
 if it errors

Also, don't error out in bucket_ref_update_err(): we don't want to
return -BCH_ERR_cannot_rewind_recovery if it's not an insert, if it's an
overwrite we continue.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/buckets.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 09eb5a543ae4..492a368a9993 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -406,7 +406,15 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf
 	if (insert) {
 		bch2_trans_updates_to_text(buf, trans);
 		__bch2_inconsistent_error(c, buf);
-		ret = -BCH_ERR_bucket_ref_update;
+		/*
+		 * If we're in recovery, run_explicit_recovery_pass might give
+		 * us an error code for rewinding recovery
+		 */
+		if (!ret)
+			ret = -BCH_ERR_bucket_ref_update;
+	} else {
+		/* Always ignore overwrite errors, so that deletion works */
+		ret = 0;
 	}
 
 	if (print || insert)
@@ -971,15 +979,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 			   bch2_data_type_str(type),
 			   bch2_data_type_str(type));
 
-		bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
+		bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
 
-		bch2_run_explicit_recovery_pass(c, &buf,
+		ret = bch2_run_explicit_recovery_pass(c, &buf,
 					BCH_RECOVERY_PASS_check_allocations, 0);
 
-		if (print)
-			bch2_print_str(c, KERN_ERR, buf.buf);
+		/* Always print, this is always fatal */
+		bch2_print_str(c, KERN_ERR, buf.buf);
 		printbuf_exit(&buf);
-		ret = -BCH_ERR_metadata_bucket_inconsistency;
+		if (!ret)
+			ret = -BCH_ERR_metadata_bucket_inconsistency;
 		goto err;
 	}
 

From dc37dcca8cb71d7ddddfd5035619eeb27c5aab8d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 15:24:00 -0400
Subject: [PATCH 04/69] bcachefs: bch2_kthread_io_clock_wait_once()

Add a version of bch2_kthread_io_clock_wait() that only schedules once -
behaving more like schedule_timeout().

This will be used for fixing rebalance wakeups.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/clock.c | 47 +++++++++++++++++----------------------------
 fs/bcachefs/clock.h |  1 +
 2 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index d6dd12d74d4f..8e9264b5a84e 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
 
 struct io_clock_wait {
 	struct io_timer		io_timer;
-	struct timer_list	cpu_timer;
 	struct task_struct	*task;
 	int			expired;
 };
@@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer)
 	wake_up_process(wait->task);
 }
 
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
-	struct io_clock_wait *wait = container_of(timer,
-				struct io_clock_wait, cpu_timer);
-
-	wait->expired = 1;
-	wake_up_process(wait->task);
-}
-
 void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
 {
 	struct io_clock_wait wait = {
@@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
 	bch2_io_timer_del(clock, &wait.io_timer);
 }
 
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
-				u64 io_until, unsigned long cpu_timeout)
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
+				     u64 io_until, unsigned long cpu_timeout)
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	struct io_clock_wait wait = {
@@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
 
 	bch2_io_timer_add(clock, &wait.io_timer);
 
-	timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
-	if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
-		mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
-	do {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (kthread && kthread_should_stop())
-			break;
-
-		if (wait.expired)
-			break;
-
-		schedule();
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (!(kthread && kthread_should_stop())) {
+		cpu_timeout = schedule_timeout(cpu_timeout);
 		try_to_freeze();
-	} while (0);
+	}
 
 	__set_current_state(TASK_RUNNING);
-	timer_delete_sync(&wait.cpu_timer);
-	destroy_timer_on_stack(&wait.cpu_timer);
 	bch2_io_timer_del(clock, &wait.io_timer);
+	return cpu_timeout;
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+				u64 io_until, unsigned long cpu_timeout)
+{
+	bool kthread = (current->flags & PF_KTHREAD) != 0;
+
+	while (!(kthread && kthread_should_stop()) &&
+	       cpu_timeout &&
+	       atomic64_read(&clock->now) < io_until)
+		cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
 }
 
 static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 82c79c8baf92..8769be2aa21e 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,6 +4,7 @@
 
 void bch2_io_timer_add(struct io_clock *, struct io_timer *);
 void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
 void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
 
 void __bch2_increment_clock(struct io_clock *, u64);

From 9e2c3c2ed4772cb0e2ad5b0def08c2c943483445 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 15:29:50 -0400
Subject: [PATCH 05/69] bcachefs: Fix lost rebalance wakeups

Fix a missing wakeup in

'bcachefs set-file-option' -> xattr option update -> inode_write

this was missing because the wakeup needs to happen after transaction
commit. Also, add a 'kick' counter, to make sure we don't miss a wakeup
that occured right after we finished checking the rebalance_work btree.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs.c              | 6 +++++-
 fs/bcachefs/rebalance.c       | 6 ++++--
 fs/bcachefs/rebalance.h       | 8 +++-----
 fs/bcachefs/rebalance_types.h | 1 +
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ddfe89d84966..adae43223bce 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -124,8 +124,9 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 		goto err;
 
 	struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+	bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
 
-	if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+	if (rebalance_changed) {
 		ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
 		if (ret)
 			goto err;
@@ -146,6 +147,9 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
+	if (rebalance_changed)
+		bch2_rebalance_wakeup(c);
+
 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
 			     "%s: inode %llu:%llu not found when updating",
 			     bch2_err_str(ret),
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index de1ec9e0caa0..dbaabaad1986 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -527,7 +527,7 @@ static void rebalance_wait(struct bch_fs *c)
 		r->state		= BCH_REBALANCE_waiting;
 	}
 
-	bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+	bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
 }
 
 static bool bch2_rebalance_enabled(struct bch_fs *c)
@@ -544,6 +544,7 @@ static int do_rebalance(struct moving_context *ctxt)
 	struct bch_fs_rebalance *r = &c->rebalance;
 	struct btree_iter rebalance_work_iter, extent_iter = {};
 	struct bkey_s_c k;
+	u32 kick = r->kick;
 	int ret = 0;
 
 	bch2_trans_begin(trans);
@@ -593,7 +594,8 @@ static int do_rebalance(struct moving_context *ctxt)
 	if (!ret &&
 	    !kthread_should_stop() &&
 	    !atomic64_read(&r->work_stats.sectors_seen) &&
-	    !atomic64_read(&r->scan_stats.sectors_seen)) {
+	    !atomic64_read(&r->scan_stats.sectors_seen) &&
+	    kick == r->kick) {
 		bch2_moving_ctxt_flush_all(ctxt);
 		bch2_trans_unlock_long(trans);
 		rebalance_wait(c);
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 5d9214fe1a22..7a565ea7dbfc 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -39,13 +39,11 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *);
 
 static inline void bch2_rebalance_wakeup(struct bch_fs *c)
 {
-	struct task_struct *p;
-
-	rcu_read_lock();
-	p = rcu_dereference(c->rebalance.thread);
+	c->rebalance.kick++;
+	guard(rcu)();
+	struct task_struct *p = rcu_dereference(c->rebalance.thread);
 	if (p)
 		wake_up_process(p);
-	rcu_read_unlock();
 }
 
 void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 33d77286f1d5..c659da149fa3 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -18,6 +18,7 @@ enum bch_rebalance_states {
 
 struct bch_fs_rebalance {
 	struct task_struct __rcu	*thread;
+	u32				kick;
 	struct bch_pd_controller pd;
 
 	enum bch_rebalance_states	state;

From 1cda5b88e6d13ebf42078253abbb2ed0efe9ab0a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 19:53:03 -0400
Subject: [PATCH 06/69] bcachefs: Fix missing commit in check_dirents

Other repair code seems to be doing commits themselves, but
check_key_has_snapshot() does not.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 49f46df8340e..8e07a365b24c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2312,9 +2312,10 @@ int bch2_check_dirents(struct bch_fs *c)
 	snapshots_seen_init(&s);
 
 	int ret = bch2_trans_run(c,
-		for_each_btree_key(trans, iter, BTREE_ID_dirents,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
 				POS(BCACHEFS_ROOT_INO, 0),
 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
 		check_subdir_count_notnested(trans, &dir));
 

From 686db67a8ebecdc6eb7b9ca8ef8eddb99bdf1083 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 25 May 2025 11:51:33 -0400
Subject: [PATCH 07/69] bcachefs: Move unicode message to after the startup
 message

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/super.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 11579b74c640..df42a66b8bc3 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1038,10 +1038,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
 		ret = -EINVAL;
 		goto err;
 	}
-	bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
-		 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
-		 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
-		 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
 #else
 	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
 		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
@@ -1159,6 +1155,12 @@ int bch2_fs_start(struct bch_fs *c)
 
 	print_mount_opts(c);
 
+	if (IS_ENABLED(CONFIG_UNICODE))
+		bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+			 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+			 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+			 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+
 	if (!bch2_fs_may_start(c))
 		return -BCH_ERR_insufficient_devices_to_start;
 

From 72ab5136e86fcbccebb4a423d83332f41a7bd697 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 11:12:53 -0400
Subject: [PATCH 08/69] bcachefs: Don't rewind to run a recovery pass we
 already ran

Fix a small regression from the "run recovery passes" rewrite, which
enabled async recovery passes.

This fixes getting stuck in a loop in recovery.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/recovery_passes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index dabb29b08ad0..212658cb97dd 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -315,7 +315,9 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
 		goto out;
 
 	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
-	bool rewind = in_recovery && r->curr_pass > pass;
+	bool rewind = in_recovery &&
+		r->curr_pass > pass &&
+		!(r->passes_complete & BIT_ULL(pass));
 	bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
 
 	if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {

From cd04497b10e6178a7510329465d05788b906ce5f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 12:21:57 -0400
Subject: [PATCH 09/69] bcachefs: Journal read error message improvements

- Don't print a checksum error when we first read a journal entry: we
  print a checksum error later if we'll be using the journal entry.

- Continuing with the theme of of improving error messages and grouping
  errors into a single log message per error, print a single 'checksum
  error' message per journal entry, and use bch2_journal_ptr_to_text()
  to print out where on the device it was.

- Factor out checksum error messages and checking for missing journal
  entries into helpers, bch2_journal_read() has gotten obnoxiously big.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal_io.c | 230 ++++++++++++++++++++++++---------------
 fs/bcachefs/journal_io.h |   1 +
 2 files changed, 141 insertions(+), 90 deletions(-)

diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 63bb207208b2..a322c3d7123a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -49,14 +49,31 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
 	mutex_unlock(&c->sb_lock);
 }
 
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-			       struct journal_replay *j)
+static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
+{
+	struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
+	prt_printf(out, "%s %u:%u:%u (sector %llu)",
+		   ca ? ca->name : "(invalid dev)",
+		   p->dev, p->bucket, p->bucket_offset, p->sector);
+	bch2_dev_put(ca);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
 {
 	darray_for_each(j->ptrs, i) {
 		if (i != j->ptrs.data)
 			prt_printf(out, " ");
-		prt_printf(out, "%u:%u:%u (sector %llu)",
-			   i->dev, i->bucket, i->bucket_offset, i->sector);
+		bch2_journal_ptr_to_text(out, c, i);
+	}
+}
+
+static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
+{
+	for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
+		struct jset_entry_datetime *datetime =
+			container_of(entry, struct jset_entry_datetime, entry);
+		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+		break;
 	}
 }
 
@@ -64,15 +81,9 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
 					struct journal_replay *j)
 {
 	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-
+	bch2_journal_datetime_to_text(out, &j->j);
+	prt_char(out, ' ');
 	bch2_journal_ptrs_to_text(out, c, j);
-
-	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
-		struct jset_entry_datetime *datetime =
-			container_of(entry, struct jset_entry_datetime, entry);
-		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-		break;
-	}
 }
 
 static struct nonce journal_nonce(const struct jset *jset)
@@ -1037,7 +1048,6 @@ static int journal_read_bucket(struct bch_dev *ca,
 	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
 	    end = offset + ca->mi.bucket_size;
 	bool saw_bad = false, csum_good;
-	struct printbuf err = PRINTBUF;
 	int ret = 0;
 
 	pr_debug("reading %u", bucket);
@@ -1078,7 +1088,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 				 * found on a different device, and missing or
 				 * no journal entries will be handled later
 				 */
-				goto out;
+				return 0;
 			}
 
 			j = buf->data;
@@ -1095,12 +1105,12 @@ static int journal_read_bucket(struct bch_dev *ca,
 				ret = journal_read_buf_realloc(buf,
 							vstruct_bytes(j));
 				if (ret)
-					goto err;
+					return ret;
 			}
 			goto reread;
 		case JOURNAL_ENTRY_NONE:
 			if (!saw_bad)
-				goto out;
+				return 0;
 			/*
 			 * On checksum error we don't really trust the size
 			 * field of the journal entry we read, so try reading
@@ -1109,7 +1119,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 			sectors = block_sectors(c);
 			goto next_block;
 		default:
-			goto err;
+			return ret;
 		}
 
 		if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
@@ -1126,22 +1136,20 @@ static int journal_read_bucket(struct bch_dev *ca,
 		 * bucket:
 		 */
 		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
-			goto out;
+			return 0;
 
 		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-		enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
 		struct bch_csum csum;
 		csum_good = jset_csum_good(c, j, &csum);
 
 		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
 
 		if (!csum_good) {
-			bch_err_dev_ratelimited(ca, "%s",
-				(printbuf_reset(&err),
-				 prt_str(&err, "journal "),
-				 bch2_csum_err_msg(&err, csum_type, j->csum, csum),
-				 err.buf));
+			/*
+			 * Don't print an error here, we'll print the error
+			 * later if we need this journal entry
+			 */
 			saw_bad = true;
 		}
 
@@ -1153,6 +1161,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 		mutex_lock(&jlist->lock);
 		ret = journal_entry_add(c, ca, (struct journal_ptr) {
 					.csum_good	= csum_good,
+					.csum		= csum,
 					.dev		= ca->dev_idx,
 					.bucket		= bucket,
 					.bucket_offset	= offset -
@@ -1167,7 +1176,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
 			break;
 		default:
-			goto err;
+			return ret;
 		}
 next_block:
 		pr_debug("next");
@@ -1176,11 +1185,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 		j = ((void *) j) + (sectors << 9);
 	}
 
-out:
-	ret = 0;
-err:
-	printbuf_exit(&err);
-	return ret;
+	return 0;
 }
 
 static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1229,13 +1234,105 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 	goto out;
 }
 
+noinline_for_stack
+static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
+{
+	struct printbuf buf = PRINTBUF;
+	enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
+	bool have_good = false;
+
+	prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
+	bch2_journal_datetime_to_text(&buf, &j->j);
+	prt_newline(&buf);
+
+	darray_for_each(j->ptrs, ptr)
+		if (!ptr->csum_good) {
+			bch2_journal_ptr_to_text(&buf, c, ptr);
+			prt_char(&buf, ' ');
+			bch2_csum_to_text(&buf, csum_type, ptr->csum);
+			prt_newline(&buf);
+		} else {
+			have_good = true;
+		}
+
+	prt_printf(&buf, "should be ");
+	bch2_csum_to_text(&buf, csum_type, j->j.csum);
+
+	if (have_good)
+		prt_printf(&buf, "\n(had good copy on another device)");
+
+	bch2_print_str(c, KERN_ERR, buf.buf);
+	printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
+{
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	struct genradix_iter radix_iter;
+	struct journal_replay *i, **_i, *prev = NULL;
+	u64 seq = start_seq;
+
+	genradix_for_each(&c->journal_entries, radix_iter, _i) {
+		i = *_i;
+
+		if (journal_replay_ignore(i))
+			continue;
+
+		BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+		while (seq < le64_to_cpu(i->j.seq)) {
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			if (seq == le64_to_cpu(i->j.seq))
+				break;
+
+			u64 missing_start = seq;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       !bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			u64 missing_end = seq - 1;
+
+			printbuf_reset(&buf);
+			prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				   missing_start, missing_end,
+				   start_seq, end_seq);
+
+			prt_printf(&buf, "\nprev at ");
+			if (prev) {
+				bch2_journal_ptrs_to_text(&buf, c, prev);
+				prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+			} else
+				prt_printf(&buf, "(none)");
+
+			prt_printf(&buf, "\nnext at ");
+			bch2_journal_ptrs_to_text(&buf, c, i);
+			prt_printf(&buf, ", continue?");
+
+			fsck_err(c, journal_entries_missing, "%s", buf.buf);
+		}
+
+		prev = i;
+		seq++;
+	}
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
 int bch2_journal_read(struct bch_fs *c,
 		      u64 *last_seq,
 		      u64 *blacklist_seq,
 		      u64 *start_seq)
 {
 	struct journal_list jlist;
-	struct journal_replay *i, **_i, *prev = NULL;
+	struct journal_replay *i, **_i;
 	struct genradix_iter radix_iter;
 	struct printbuf buf = PRINTBUF;
 	bool degraded = false, last_write_torn = false;
@@ -1354,56 +1451,9 @@ int bch2_journal_read(struct bch_fs *c,
 		}
 	}
 
-	/* Check for missing entries: */
-	seq = *last_seq;
-	genradix_for_each(&c->journal_entries, radix_iter, _i) {
-		i = *_i;
-
-		if (journal_replay_ignore(i))
-			continue;
-
-		BUG_ON(seq > le64_to_cpu(i->j.seq));
-
-		while (seq < le64_to_cpu(i->j.seq)) {
-			u64 missing_start, missing_end;
-			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			if (seq == le64_to_cpu(i->j.seq))
-				break;
-
-			missing_start = seq;
-
-			while (seq < le64_to_cpu(i->j.seq) &&
-			       !bch2_journal_seq_is_blacklisted(c, seq, false))
-				seq++;
-
-			if (prev) {
-				bch2_journal_ptrs_to_text(&buf1, c, prev);
-				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
-			} else
-				prt_printf(&buf1, "(none)");
-			bch2_journal_ptrs_to_text(&buf2, c, i);
-
-			missing_end = seq - 1;
-			fsck_err(c, journal_entries_missing,
-				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
-				 "prev at %s\n"
-				 "next at %s, continue?",
-				 missing_start, missing_end,
-				 *last_seq, *blacklist_seq - 1,
-				 buf1.buf, buf2.buf);
-
-			printbuf_exit(&buf1);
-			printbuf_exit(&buf2);
-		}
-
-		prev = i;
-		seq++;
-	}
+	ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1);
+	if (ret)
+		goto err;
 
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
 		union bch_replicas_padded replicas = {
@@ -1416,15 +1466,15 @@ int bch2_journal_read(struct bch_fs *c,
 		if (journal_replay_ignore(i))
 			continue;
 
-		darray_for_each(i->ptrs, ptr) {
-			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
-			if (!ptr->csum_good)
-				bch_err_dev_offset(ca, ptr->sector,
-						   "invalid journal checksum, seq %llu%s",
-						   le64_to_cpu(i->j.seq),
-						   i->csum_good ? " (had good copy on another device)" : "");
-		}
+		/*
+		 * Don't print checksum errors until we know we're going to use
+		 * a given journal entry:
+		 */
+		darray_for_each(i->ptrs, ptr)
+			if (!ptr->csum_good) {
+				bch2_journal_print_checksum_error(c, i);
+				break;
+			}
 
 		ret = jset_validate(c,
 				    bch2_dev_have_ref(c, i->ptrs.data[0].dev),
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 12b39fcb4424..6fa82c4050fe 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
 
 struct journal_ptr {
 	bool		csum_good;
+	struct bch_csum	csum;
 	u8		dev;
 	u32		bucket;
 	u32		bucket_offset;

From d6efd42a8450c67d283dbaacd127dcccca858f51 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 25 May 2025 17:04:11 -0400
Subject: [PATCH 10/69] bcachefs: Fix infinite loop in
 journal_entry_btree_keys_to_text()

Fix an infinite loop when bkey_i->k.u64s is 0.

This only happens in userspace, where 'bcachefs list_journal' can print
the entire contents of the journal, and non-dirty entries aren't
validated.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal_io.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index a322c3d7123a..30122abf3e2c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -429,6 +429,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
 	bool first = true;
 
 	jset_entry_for_each_key(entry, k) {
+		/* We may be called on entries that haven't been validated: */
+		if (!k->k.u64s)
+			break;
+
 		if (!first) {
 			prt_newline(out);
 			bch2_prt_jset_entry_type(out, entry->type);

From 060ff4b794748ef5d290c9b58075453ed4eac59d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 23:00:21 -0400
Subject: [PATCH 11/69] bcachefs: trace_io_move_pred

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/move.c  | 88 +++++++++++++++++++++++++++++++--------------
 fs/bcachefs/trace.h |  5 +++
 2 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 79f4722621d5..f67656fbd849 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,30 +38,61 @@ const char * const bch2_data_ops_strs[] = {
 	NULL
 };
 
-static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
-			       struct bch_io_opts *io_opts,
-			       struct data_update_opts *data_opts)
-{
-	if (trace_io_move_enabled()) {
-		struct printbuf buf = PRINTBUF;
+struct evacuate_bucket_arg {
+	struct bpos		bucket;
+	int			gen;
+	struct data_update_opts	data_opts;
+};
 
-		bch2_bkey_val_to_text(&buf, c, k);
-		prt_newline(&buf);
-		bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
-		trace_io_move(c, buf.buf);
-		printbuf_exit(&buf);
-	}
+static bool evacuate_bucket_pred(struct bch_fs *, void *,
+				 enum btree_id, struct bkey_s_c,
+				 struct bch_io_opts *,
+				 struct data_update_opts *);
+
+static noinline void
+trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
+	       struct bch_io_opts *io_opts,
+	       struct data_update_opts *data_opts)
+{
+	struct printbuf buf = PRINTBUF;
+
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+	trace_io_move(c, buf.buf);
+	printbuf_exit(&buf);
 }
 
-static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
 {
-	if (trace_io_move_read_enabled()) {
-		struct printbuf buf = PRINTBUF;
+	struct printbuf buf = PRINTBUF;
 
-		bch2_bkey_val_to_text(&buf, c, k);
-		trace_io_move_read(c, buf.buf);
-		printbuf_exit(&buf);
+	bch2_bkey_val_to_text(&buf, c, k);
+	trace_io_move_read(c, buf.buf);
+	printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
+		    struct bch_io_opts *io_opts,
+		    struct data_update_opts *data_opts,
+		    move_pred_fn pred, void *_arg, bool p)
+{
+	struct printbuf buf = PRINTBUF;
+
+	prt_printf(&buf, "%ps: %u", pred, p);
+
+	if (pred == evacuate_bucket_pred) {
+		struct evacuate_bucket_arg *arg = _arg;
+		prt_printf(&buf, " gen=%u", arg->gen);
 	}
+
+	prt_newline(&buf);
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+	trace_io_move_pred(c, buf.buf);
+	printbuf_exit(&buf);
 }
 
 struct moving_io {
@@ -298,7 +329,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 	struct bch_fs *c = trans->c;
 	int ret = -ENOMEM;
 
-	trace_io_move2(c, k, &io_opts, &data_opts);
+	if (trace_io_move_enabled())
+		trace_io_move2(c, k, &io_opts, &data_opts);
 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
 
 	if (ctxt->stats)
@@ -364,7 +396,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 		atomic_inc(&io->b->count);
 	}
 
-	trace_io_move_read2(c, k);
+	if (trace_io_move_read_enabled())
+		trace_io_move_read2(c, k);
 
 	mutex_lock(&ctxt->lock);
 	atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -496,6 +529,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
 		bch2_inode_opts_get(io_opts, c, &inode);
 	}
 	bch2_trans_iter_exit(trans, &inode_iter);
+	/* seem to be spinning here? */
 out:
 	return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
 }
@@ -910,7 +944,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 		}
 
 		struct data_update_opts data_opts = {};
-		if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) {
+		bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
+
+		if (trace_io_move_pred_enabled())
+			trace_io_move_pred2(c, k, &io_opts, &data_opts,
+					    pred, arg, p);
+
+		if (!p) {
 			bch2_trans_iter_exit(trans, &iter);
 			goto next;
 		}
@@ -993,12 +1033,6 @@ int bch2_move_data_phys(struct bch_fs *c,
 	return ret;
 }
 
-struct evacuate_bucket_arg {
-	struct bpos		bucket;
-	int			gen;
-	struct data_update_opts	data_opts;
-};
-
 static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
 				 enum btree_id btree, struct bkey_s_c k,
 				 struct bch_io_opts *io_opts,
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 8cb5b40704fd..8f942913cec3 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1431,6 +1431,11 @@ DEFINE_EVENT(fs_str, data_update,
 	TP_ARGS(c, str)
 );
 
+DEFINE_EVENT(fs_str, io_move_pred,
+	TP_PROTO(struct bch_fs *c, const char *str),
+	TP_ARGS(c, str)
+);
+
 DEFINE_EVENT(fs_str, io_move_created_rebalance,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)

From c7897b5055df20e954c269897052c9397e0fff1a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 21:54:22 -0400
Subject: [PATCH 12/69] bcachefs: io_move_evacuate_bucket tracepoint, counter

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/move.c               | 18 ++++++++++++++++++
 fs/bcachefs/sb-counters_format.h |  1 +
 fs/bcachefs/trace.h              |  5 +++++
 3 files changed, 24 insertions(+)

diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index f67656fbd849..4dd779b7b6d2 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -95,6 +95,19 @@ trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
 	printbuf_exit(&buf);
 }
 
+static noinline void
+trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
+{
+	struct printbuf buf = PRINTBUF;
+
+	prt_printf(&buf, "bucket: ");
+	bch2_bpos_to_text(&buf, bucket);
+	prt_printf(&buf, " gen: %i\n", gen);
+
+	trace_io_move_evacuate_bucket(c, buf.buf);
+	printbuf_exit(&buf);
+}
+
 struct moving_io {
 	struct list_head		read_list;
 	struct list_head		io_list;
@@ -1059,8 +1072,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 			 struct bpos bucket, int gen,
 			 struct data_update_opts data_opts)
 {
+	struct bch_fs *c = ctxt->trans->c;
 	struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
 
+	count_event(c, io_move_evacuate_bucket);
+	if (trace_io_move_evacuate_bucket_enabled())
+		trace_io_move_evacuate_bucket2(c, bucket, gen);
+
 	return __bch2_move_data_phys(ctxt, bucket_in_flight,
 				   bucket.inode,
 				   bucket.offset,
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 7c0c9c842b4e..b868702a431a 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -26,6 +26,7 @@ enum counters_flags {
 	x(io_move_write_fail,				82,	TYPE_COUNTER)	\
 	x(io_move_start_fail,				39,	TYPE_COUNTER)	\
 	x(io_move_created_rebalance,			83,	TYPE_COUNTER)	\
+	x(io_move_evacuate_bucket,			84,	TYPE_COUNTER)	\
 	x(bucket_invalidate,				3,	TYPE_COUNTER)	\
 	x(bucket_discard,				4,	TYPE_COUNTER)	\
 	x(bucket_discard_fast,				79,	TYPE_COUNTER)	\
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 8f942913cec3..eb3ca963fc58 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1441,6 +1441,11 @@ DEFINE_EVENT(fs_str, io_move_created_rebalance,
 	TP_ARGS(c, str)
 );
 
+DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
+	TP_PROTO(struct bch_fs *c, const char *str),
+	TP_ARGS(c, str)
+);
+
 TRACE_EVENT(error_downcast,
 	TP_PROTO(int bch_err, int std_err, unsigned long ip),
 	TP_ARGS(bch_err, std_err, ip),

From 327971cef509c6380ed4bd8587fad26ceb2ab90b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 21:45:56 -0400
Subject: [PATCH 13/69] bcachefs: Catch data_update_done events in
 trace_io_move_start_fail

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/move.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4dd779b7b6d2..857519cdac4e 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -436,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt,
 err_free:
 	kfree(io);
 err:
-	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
-		return 0;
-
 	if (bch2_err_matches(ret, EROFS) ||
 	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		return ret;
@@ -454,6 +451,9 @@ int bch2_move_extent(struct moving_context *ctxt,
 		trace_io_move_start_fail(c, buf.buf);
 		printbuf_exit(&buf);
 	}
+
+	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
+		return 0;
 	return ret;
 }
 

From 813825d24135aa3109cf7746ce11f1ec2ab901fd Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 20:37:50 -0400
Subject: [PATCH 14/69] bcachefs: Fix incorrect multiple dev check in journal
 write path

It's uncomon to have multiple devices with journalling only on a subset,
but can be specified with the 'data_allowed' option. We need to know if
we're doing data/metadata writes to multiple devices, as that requires
issuing flushes before the journal writes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal_io.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 30122abf3e2c..527ce6f74b63 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1635,6 +1635,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 done:
 	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
 
+#if 0
+	/*
+	 * XXX: we need a way to alert the user when we go degraded for any
+	 * reason
+	 */
+	if (*replicas < min(replicas_want,
+			    dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
+	}
+#endif
+
 	return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
 }
 
@@ -2112,7 +2122,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	union bch_replicas_padded replicas;
-	unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]);
+	unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
 	int ret;
 
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

From f54b2a80d0df06f8c13cbd102ca9dc2c6a578c5c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 22:06:04 -0400
Subject: [PATCH 15/69] bcachefs: Fix misaligned bucket check in journal space
 calculations

Fix an assertion pop in the tiering_misaligned test: rounding down to
bucket size at the end of the journal space calculations leaves
cur_entry_sectors == 0, which is incorrect with !cur_entry_err.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal_reclaim.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 70f36f6bc482..d0604218bf65 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -83,18 +83,20 @@ static struct journal_space
 journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			    enum journal_space_from from)
 {
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_device *ja = &ca->journal;
 	unsigned sectors, buckets, unwritten;
+	unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
 	u64 seq;
 
 	if (from == journal_space_total)
 		return (struct journal_space) {
-			.next_entry	= ca->mi.bucket_size,
-			.total		= ca->mi.bucket_size * ja->nr,
+			.next_entry	= bucket_size_aligned,
+			.total		= bucket_size_aligned * ja->nr,
 		};
 
 	buckets = bch2_journal_dev_buckets_available(j, ja, from);
-	sectors = ja->sectors_free;
+	sectors = round_down(ja->sectors_free, block_sectors(c));
 
 	/*
 	 * We that we don't allocate the space for a journal entry
@@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			continue;
 
 		/* entry won't fit on this device, skip: */
-		if (unwritten > ca->mi.bucket_size)
+		if (unwritten > bucket_size_aligned)
 			continue;
 
 		if (unwritten >= sectors) {
@@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 			}
 
 			buckets--;
-			sectors = ca->mi.bucket_size;
+			sectors = bucket_size_aligned;
 		}
 
 		sectors -= unwritten;
@@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
 
 	if (sectors < ca->mi.bucket_size && buckets) {
 		buckets--;
-		sectors = ca->mi.bucket_size;
+		sectors = bucket_size_aligned;
 	}
 
 	return (struct journal_space) {
 		.next_entry	= sectors,
-		.total		= sectors + buckets * ca->mi.bucket_size,
+		.total		= sectors + buckets * bucket_size_aligned,
 	};
 }
 
@@ -256,8 +258,7 @@ void bch2_journal_space_available(struct journal *j)
 	bch2_journal_set_watermark(j);
 out:
 	j->cur_entry_sectors	= !ret
-		? round_down(j->space[journal_space_discarded].next_entry,
-			     block_sectors(c))
+		? j->space[journal_space_discarded].next_entry
 		: 0;
 	j->cur_entry_error	= ret;
 

From 99813d88e371c3a60d65810125b8d1568364f254 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 20:37:21 -0400
Subject: [PATCH 16/69] bcachefs: Add missing error logging in
 delete_dead_inodes()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 5cf70108ae2f..5b35dae89c13 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1533,5 +1533,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 		goto again;
 err:
 	bch2_trans_put(trans);
+	bch_err_fn(c, ret);
 	return ret;
 }

From 0d25264ecfa3868c9308684fe344da1eb2f4502c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 25 May 2025 17:56:45 -0400
Subject: [PATCH 17/69] bcachefs: Kill bkey_buf in btree_path_down()

Allocate some (smaller) temporary storage in btree_trans for this -
btree_path_down() is in our max-stack call stack.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.c  | 38 ++++++++++++++++----------------------
 fs/bcachefs/btree_types.h |  2 ++
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index b4bf4217a3fa..6303531b4c5b 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -890,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
 
 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 						     struct btree_path *path,
-						     unsigned flags,
-						     struct bkey_buf *out)
+						     unsigned flags)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_path_level *l = path_l(path);
@@ -915,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 		goto err;
 	}
 
-	bch2_bkey_buf_reassemble(out, c, k);
+	bkey_reassemble(&trans->btree_path_down, k);
 
 	if ((flags & BTREE_ITER_prefetch) &&
 	    c->opts.btree_node_prefetch)
@@ -936,20 +935,17 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	struct btree *b;
 	unsigned level = path->level - 1;
 	enum six_lock_type lock_type = __btree_lock_want(path, level);
-	struct bkey_buf tmp;
 	int ret;
 
 	EBUG_ON(!btree_node_locked(path, path->level));
 
-	bch2_bkey_buf_init(&tmp);
-
 	if (unlikely(trans->journal_replay_not_finished)) {
-		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+		ret = btree_node_iter_and_journal_peek(trans, path, flags);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-		if (!k) {
+		if (unlikely(!k)) {
 			struct printbuf buf = PRINTBUF;
 
 			prt_str(&buf, "node not found at pos ");
@@ -959,28 +955,28 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 
 			bch2_fs_fatal_error(c, "%s", buf.buf);
 			printbuf_exit(&buf);
-			ret = -BCH_ERR_btree_need_topology_repair;
-			goto err;
+			return -BCH_ERR_btree_need_topology_repair;
 		}
 
-		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+		bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
 
-		if ((flags & BTREE_ITER_prefetch) &&
+		if (unlikely((flags & BTREE_ITER_prefetch)) &&
 		    c->opts.btree_node_prefetch) {
 			ret = btree_path_prefetch(trans, path);
 			if (ret)
-				goto err;
+				return ret;
 		}
 	}
 
-	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+	b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+				level, lock_type, trace_ip);
 	ret = PTR_ERR_OR_ZERO(b);
 	if (unlikely(ret))
-		goto err;
+		return ret;
 
-	if (likely(!trans->journal_replay_not_finished &&
-		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
-	    unlikely(b != btree_node_mem_ptr(tmp.k)))
+	if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+	    likely(!trans->journal_replay_not_finished &&
+		   trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
 		btree_node_mem_ptr_set(trans, path, level + 1, b);
 
 	if (btree_node_read_locked(path, level + 1))
@@ -992,9 +988,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	bch2_btree_path_level_init(trans, path, b);
 
 	bch2_btree_path_verify_locks(trans, path);
-err:
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
+	return 0;
 }
 
 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 9d641bf9d2a2..c61c4171ae50 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -555,6 +555,8 @@ struct btree_trans {
 	unsigned		journal_u64s;
 	unsigned		extra_disk_res; /* XXX kill */
 
+	__BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif

From 19c0a8aa8ae302a3b038283c7d7eee0371ae3bf5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 12:48:19 -0400
Subject: [PATCH 18/69] bcachefs: btree_node_missing_err()

Factor out an error path for a small stack usage improvement.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 6303531b4c5b..5affa5fc22f4 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -925,6 +925,22 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 	return ret;
 }
 
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+						     struct btree_path *path)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	prt_str(&buf, "node not found at pos ");
+	bch2_bpos_to_text(&buf, path->pos);
+	prt_str(&buf, " within parent node ");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+	bch2_fs_fatal_error(c, "%s", buf.buf);
+	printbuf_exit(&buf);
+	return -BCH_ERR_btree_need_topology_repair;
+}
+
 static __always_inline int btree_path_down(struct btree_trans *trans,
 					   struct btree_path *path,
 					   unsigned flags,
@@ -945,18 +961,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 			return ret;
 	} else {
 		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-		if (unlikely(!k)) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "node not found at pos ");
-			bch2_bpos_to_text(&buf, path->pos);
-			prt_str(&buf, " within parent node ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
-			bch2_fs_fatal_error(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-			return -BCH_ERR_btree_need_topology_repair;
-		}
+		if (unlikely(!k))
+			return btree_node_missing_err(trans, path);
 
 		bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
 

From cd831a9494524726babf835434da1438bcff6f45 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 16:29:56 -0400
Subject: [PATCH 19/69] bcachefs: factor out break_cycle_fail()

More stack usage work.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_locking.c | 53 ++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 2f2aed0c9916..09ae5a8c6874 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -194,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
 	return 3;
 }
 
+static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
+{
+	struct printbuf buf = PRINTBUF;
+	buf.atomic++;
+
+	prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+	for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
+		struct btree_trans *trans = i->trans;
+
+		bch2_btree_trans_to_text(&buf, trans);
+
+		prt_printf(&buf, "backtrace:\n");
+		printbuf_indent_add(&buf, 2);
+		bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+		printbuf_indent_sub(&buf, 2);
+		prt_newline(&buf);
+	}
+
+	bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf);
+	printbuf_exit(&buf);
+	BUG();
+}
+
 static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
 				struct trans_waiting_for_lock *from)
 {
@@ -219,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
 		}
 	}
 
-	if (unlikely(!best)) {
-		struct printbuf buf = PRINTBUF;
-		buf.atomic++;
-
-		prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
-		for (i = g->g; i < g->g + g->nr; i++) {
-			struct btree_trans *trans = i->trans;
-
-			bch2_btree_trans_to_text(&buf, trans);
-
-			prt_printf(&buf, "backtrace:\n");
-			printbuf_indent_add(&buf, 2);
-			bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
-			printbuf_indent_sub(&buf, 2);
-			prt_newline(&buf);
-		}
-
-		bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf);
-		printbuf_exit(&buf);
-		BUG();
-	}
+	if (unlikely(!best))
+		break_cycle_fail(g);
 
 	ret = abort_lock(g, abort);
 out:
@@ -255,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
 			      struct printbuf *cycle)
 {
 	struct btree_trans *orig_trans = g->g->trans;
-	struct trans_waiting_for_lock *i;
 
-	for (i = g->g; i < g->g + g->nr; i++)
+	for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
 		if (i->trans == trans) {
 			closure_put(&trans->ref);
 			return break_cycle(g, cycle, i);
 		}
 
-	if (g->nr == ARRAY_SIZE(g->g)) {
+	if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
 		closure_put(&trans->ref);
 
 		if (orig_trans->lock_may_not_fail)

From 92caf17189a5b32bb9349a7a1f329cdd5fa51eb4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 16:16:17 -0400
Subject: [PATCH 20/69] bcachefs: Don't stack allocate bch_writepage_state

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs-io-buffered.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index e3a75dcca60c..66bacdd49f78 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -394,17 +394,9 @@ struct bch_writepage_state {
 	struct bch_io_opts	opts;
 	struct bch_folio_sector	*tmp;
 	unsigned		tmp_sectors;
+	struct blk_plug		plug;
 };
 
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
-								  struct bch_inode_info *inode)
-{
-	struct bch_writepage_state ret = { 0 };
-
-	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
-	return ret;
-}
-
 /*
  * Determine when a writepage io is full. We have to limit writepage bios to a
  * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
@@ -666,17 +658,17 @@ static int __bch2_writepage(struct folio *folio,
 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-	struct bch_writepage_state w =
-		bch_writepage_state_init(c, to_bch_ei(mapping->host));
-	struct blk_plug plug;
-	int ret;
+	struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
 
-	blk_start_plug(&plug);
-	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
-	if (w.io)
-		bch2_writepage_do_io(&w);
-	blk_finish_plug(&plug);
-	kfree(w.tmp);
+	bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+
+	blk_start_plug(&w->plug);
+	int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+	if (w->io)
+		bch2_writepage_do_io(w);
+	blk_finish_plug(&w->plug);
+	kfree(w->tmp);
+	kfree(w);
 	return bch2_err_class(ret);
 }
 

From 56e5c7f65f582d60e900f356f0dff542b90630b1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 14:15:28 -0400
Subject: [PATCH 21/69] bcachefs: kill replicas_sectors arg to
 __trigger_extent()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/buckets.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 492a368a9993..b9e92bd55e1c 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -740,8 +740,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 static int __trigger_extent(struct btree_trans *trans,
 			    enum btree_id btree_id, unsigned level,
 			    struct bkey_s_c k,
-			    enum btree_iter_update_trigger_flags flags,
-			    s64 *replicas_sectors)
+			    enum btree_iter_update_trigger_flags flags)
 {
 	bool gc = flags & BTREE_TRIGGER_gc;
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -752,6 +751,8 @@ static int __trigger_extent(struct btree_trans *trans,
 		: BCH_DATA_user;
 	int ret = 0;
 
+	s64 replicas_sectors = 0;
+
 	struct disk_accounting_pos acc_replicas_key;
 	memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
 	acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
@@ -778,7 +779,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			if (ret)
 				return ret;
 		} else if (!p.has_ec) {
-			*replicas_sectors       += disk_sectors;
+			replicas_sectors       += disk_sectors;
 			replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
 		} else {
 			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -816,13 +817,13 @@ static int __trigger_extent(struct btree_trans *trans,
 	}
 
 	if (acc_replicas_key.replicas.nr_devs) {
-		ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+		ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
 		if (ret)
 			return ret;
 	}
 
 	if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
-		ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
+		ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
 		if (ret)
 			return ret;
 	}
@@ -838,7 +839,7 @@ static int __trigger_extent(struct btree_trans *trans,
 	}
 
 	if (level) {
-		ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
+		ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
 		if (ret)
 			return ret;
 	} else {
@@ -847,7 +848,7 @@ static int __trigger_extent(struct btree_trans *trans,
 		s64 v[3] = {
 			insert ? 1 : -1,
 			insert ? k.k->size : -((s64) k.k->size),
-			*replicas_sectors,
+			replicas_sectors,
 		};
 		ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
 		if (ret)
@@ -879,20 +880,16 @@ int bch2_trigger_extent(struct btree_trans *trans,
 		return 0;
 
 	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
-		s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
-
 		if (old.k->type) {
 			int ret = __trigger_extent(trans, btree, level, old,
-						   flags & ~BTREE_TRIGGER_insert,
-						   &old_replicas_sectors);
+						   flags & ~BTREE_TRIGGER_insert);
 			if (ret)
 				return ret;
 		}
 
 		if (new.k->type) {
 			int ret = __trigger_extent(trans, btree, level, new.s_c,
-						   flags & ~BTREE_TRIGGER_overwrite,
-						   &new_replicas_sectors);
+						   flags & ~BTREE_TRIGGER_overwrite);
 			if (ret)
 				return ret;
 		}

From 0c34e7ff69036ffcb006ee86fd0143f3620dd491 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 13:26:10 -0400
Subject: [PATCH 22/69] bcachefs: Tweak bch2_data_update_init() for stack usage

- Separate out a slowpath for bkey_nocow_lock()
- Don't call bch2_bkey_ptrs_c() or loop over pointers more than
  necessary

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c | 111 +++++++++++++++++++++++---------------
 1 file changed, 67 insertions(+), 44 deletions(-)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index c34e5b88ba9d..1216729c20e8 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -66,37 +66,46 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
 	}
 }
 
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+static noinline_for_stack
+bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
+		       const struct bch_extent_ptr *start)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	if (!ctxt) {
+		bkey_for_each_ptr(ptrs, ptr) {
+			if (ptr == start)
+				break;
 
+			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+			struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+		}
+		return false;
+	}
+
+	__bkey_for_each_ptr(start, ptrs.end, ptr) {
+		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+		bool locked;
+		move_ctxt_wait_event(ctxt,
+				     (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+				     list_empty(&ctxt->ios));
+		if (!locked)
+			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+	}
+	return true;
+}
+
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
+{
 	bkey_for_each_ptr(ptrs, ptr) {
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
 
-		if (ctxt) {
-			bool locked;
-
-			move_ctxt_wait_event(ctxt,
-				(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
-				list_empty(&ctxt->ios));
-
-			if (!locked)
-				bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
-		} else {
-			if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
-				bkey_for_each_ptr(ptrs, ptr2) {
-					if (ptr2 == ptr)
-						break;
-
-					ca = bch2_dev_have_ref(c, ptr2->dev);
-					bucket = PTR_BUCKET_POS(ca, ptr2);
-					bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
-				}
-				return false;
-			}
-		}
+		if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
+			return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
 	}
+
 	return true;
 }
 
@@ -523,8 +532,9 @@ void bch2_data_update_exit(struct data_update *update)
 	bch2_bkey_buf_exit(&update->k, c);
 }
 
-static int bch2_update_unwritten_extent(struct btree_trans *trans,
-					struct data_update *update)
+static noinline_for_stack
+int bch2_update_unwritten_extent(struct btree_trans *trans,
+				 struct data_update *update)
 {
 	struct bch_fs *c = update->op.c;
 	struct bkey_i_extent *e;
@@ -716,18 +726,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
 		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
-			       struct bch_io_opts *io_opts)
+static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+					struct bch_io_opts *io_opts,
+					unsigned buf_bytes)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-
-	/* write path might have to decompress data: */
-	unsigned buf_bytes = 0;
-	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
-		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
 	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
 
 	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
@@ -751,6 +753,21 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
 	return 0;
 }
 
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+			       struct bch_io_opts *io_opts)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	/* write path might have to decompress data: */
+	unsigned buf_bytes = 0;
+	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+	return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
+}
+
 static int can_write_extent(struct bch_fs *c, struct data_update *m)
 {
 	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
@@ -802,10 +819,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 			  struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
 	int ret = 0;
 
 	/*
@@ -842,6 +855,13 @@ int bch2_data_update_init(struct btree_trans *trans,
 
 	unsigned durability_have = 0, durability_removing = 0;
 
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
+	unsigned buf_bytes = 0;
+	bool unwritten = false;
+
 	unsigned ptr_bit = 1;
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		if (!p.ptr.cached) {
@@ -872,6 +892,9 @@ int bch2_data_update_init(struct btree_trans *trans,
 		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
 			m->op.incompressible = true;
 
+		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+		unwritten |= p.ptr.unwritten;
+
 		ptr_bit <<= 1;
 	}
 
@@ -946,18 +969,18 @@ int bch2_data_update_init(struct btree_trans *trans,
 	}
 
 	if (c->opts.nocow_enabled &&
-	    !bkey_nocow_lock(c, ctxt, k)) {
+	    !bkey_nocow_lock(c, ctxt, ptrs)) {
 		ret = -BCH_ERR_nocow_lock_blocked;
 		goto out_put_dev_refs;
 	}
 
-	if (bkey_extent_is_unwritten(k)) {
+	if (unwritten) {
 		ret = bch2_update_unwritten_extent(trans, m) ?:
 			-BCH_ERR_data_update_done_unwritten;
 		goto out_nocow_unlock;
 	}
 
-	ret = bch2_data_update_bios_init(m, c, io_opts);
+	ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
 	if (ret)
 		goto out_nocow_unlock;
 

From eabef52ff881e91f41c7bd6696c9355d12fc00c2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 14:24:19 -0400
Subject: [PATCH 23/69] bcachefs: bch2_alloc_v4_to_text()

Specialize the .to_text() for alloc_v4, to avoid the temporary on the
stack for conversion from old versions.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c | 20 ++++++++++++++++----
 fs/bcachefs/alloc_background.h |  3 ++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 173e81c2bbcb..271d4a2eb15f 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -337,11 +337,10 @@ void bch2_alloc_v4_swab(struct bkey_s k)
 	a->stripe_sectors	= swab32(a->stripe_sectors);
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
+					   unsigned dev, const struct bch_alloc_v4 *a)
 {
-	struct bch_alloc_v4 _a;
-	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-	struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
+	struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
 
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
@@ -369,6 +368,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 	bch2_dev_put(ca);
 }
 
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bch_alloc_v4 _a;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+
+	__bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
+}
+
+void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+	__bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
+}
+
 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
 {
 	if (k.k->type == KEY_TYPE_alloc_v4) {
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 4f94c6a661bf..b97ae710219f 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -253,6 +253,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
 			   struct bkey_validate_context);
 void bch2_alloc_v4_swab(struct bkey_s);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc ((struct bkey_ops) {	\
 	.key_validate	= bch2_alloc_v1_validate,	\
@@ -277,7 +278,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) {	\
 	.key_validate	= bch2_alloc_v4_validate,	\
-	.val_to_text	= bch2_alloc_to_text,		\
+	.val_to_text	= bch2_alloc_v4_to_text,	\
 	.swab		= bch2_alloc_v4_swab,		\
 	.trigger	= bch2_trigger_alloc,		\
 	.min_val_size	= 48,				\

From ff6369da9ac366f4a18735902ab8f50f5690ee83 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 17:15:11 -0400
Subject: [PATCH 24/69] bcachefs: reduce stack usage in alloc_sectors_start()

with typical config options, variables in different inline functions
aren't sharing stack space - and these are slowpaths.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_foreground.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 1a52c12c51ae..4341201deec3 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1104,7 +1104,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
 	return stranded * factor > free;
 }
 
-static bool try_increase_writepoints(struct bch_fs *c)
+static noinline bool try_increase_writepoints(struct bch_fs *c)
 {
 	struct write_point *wp;
 
@@ -1117,7 +1117,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
 	return true;
 }
 
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
 {
 	struct bch_fs *c = trans->c;
 	struct write_point *wp;

From e87de7d4918bd43901685dd1dd5620fc1aafd0aa Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 26 May 2025 17:03:48 -0400
Subject: [PATCH 25/69] bcachefs: Move devs_sorted to alloc_request

More stack usage work.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_foreground.c | 35 +++++++++++++++++-----------------
 fs/bcachefs/alloc_foreground.h |  8 +++++---
 fs/bcachefs/journal_io.c       |  2 +-
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 4341201deec3..0e7eeb89299c 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -603,18 +603,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
 
 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
 
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
-					  struct dev_stripe_state *stripe,
-					  struct bch_devs_mask *devs)
+void bch2_dev_alloc_list(struct bch_fs *c,
+			 struct dev_stripe_state *stripe,
+			 struct bch_devs_mask *devs,
+			 struct dev_alloc_list *ret)
 {
-	struct dev_alloc_list ret = { .nr = 0 };
+	ret->nr = 0;
+
 	unsigned i;
-
 	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
-		ret.data[ret.nr++] = i;
+		ret->data[ret->nr++] = i;
 
-	bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
-	return ret;
+	bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
 }
 
 static const u64 stripe_clock_hand_rescale	= 1ULL << 62; /* trigger rescale at */
@@ -705,18 +705,19 @@ static int add_new_bucket(struct bch_fs *c,
 	return 0;
 }
 
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
-				struct alloc_request *req,
-				struct dev_stripe_state *stripe,
-				struct closure *cl)
+inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+				       struct alloc_request *req,
+				       struct dev_stripe_state *stripe,
+				       struct closure *cl)
 {
 	struct bch_fs *c = trans->c;
 	int ret = -BCH_ERR_insufficient_devices;
 
 	BUG_ON(req->nr_effective >= req->nr_replicas);
 
-	struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc);
-	darray_for_each(devs_sorted, i) {
+	bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+	darray_for_each(req->devs_sorted, i) {
 		req->ca = bch2_dev_tryget_noerror(c, *i);
 		if (!req->ca)
 			continue;
@@ -776,9 +777,9 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
 	if (!h)
 		return 0;
 
-	struct dev_alloc_list devs_sorted =
-		bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc);
-	darray_for_each(devs_sorted, i)
+	bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+	darray_for_each(req->devs_sorted, i)
 		for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
 			if (!h->s->blocks[ec_idx])
 				continue;
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 2e01c7b61ed1..1b3fc8460096 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -42,6 +42,7 @@ struct alloc_request {
 	struct bch_devs_mask	devs_may_alloc;
 
 	/* bch2_bucket_alloc_set_trans(): */
+	struct dev_alloc_list	devs_sorted;
 	struct bch_dev_usage	usage;
 
 	/* bch2_bucket_alloc_trans(): */
@@ -71,9 +72,10 @@ struct alloc_request {
 	struct bch_devs_mask	scratch_devs_may_alloc;
 };
 
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
-					  struct dev_stripe_state *,
-					  struct bch_devs_mask *);
+void bch2_dev_alloc_list(struct bch_fs *,
+			 struct dev_stripe_state *,
+			 struct bch_devs_mask *,
+			 struct dev_alloc_list *);
 void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
 
 static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 527ce6f74b63..52297057531b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1613,7 +1613,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
 retry_target:
 	devs = target_rw_devs(c, BCH_DATA_journal, target);
-	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+	bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
 retry_alloc:
 	__journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
 

From a7c9add482c7b0e7e28433816fa9e8f7b890086e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 14:39:43 -0400
Subject: [PATCH 26/69] bcachefs: Include b->ob.nr in
 cached_btree_node_to_text()

We have a bug report that looks like we might be leaking open buckets -
let's check if they got left attached to the cached btree node.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/debug.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 4fa70634c90e..04db3e0ed82b 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -492,6 +492,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
 	prt_printf(out, "journal pin %px:\t%llu\n",
 		   &b->writes[1].journal, b->writes[1].journal.seq);
 
+	prt_printf(out, "ob:\t%u\n", b->ob.nr);
+
 	printbuf_indent_sub(out, 2);
 }
 

From 66b7c51ceb9f08e7dfb6b25e811b5b791100eda8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 20:51:00 -0400
Subject: [PATCH 27/69] bcachefs: bch2_check_fix_ptrs() can now repair btree
 roots

This is straightforward enough: check_fix_ptrs() currently only runs
before we go RW, so updating the btree root pointer in c->btree_roots
suffices - it'll be written out in the first journal write we do.

For that, do_bch2_trans_commit_to_journal_replay() now handles
JSET_ENTRY_btree_root entries.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_trans_commit.c | 21 +++++++++---
 fs/bcachefs/buckets.c            | 55 +++++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 1c03c965d836..f2d1edc9163c 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -966,14 +966,27 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
 
 	for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
 	     i != btree_trans_journal_entries_top(trans);
-	     i = vstruct_next(i))
+	     i = vstruct_next(i)) {
 		if (i->type == BCH_JSET_ENTRY_btree_keys ||
 		    i->type == BCH_JSET_ENTRY_write_buffer_keys) {
-			int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
-			if (ret)
-				return ret;
+			jset_entry_for_each_key(i, k) {
+				int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
+				if (ret)
+					return ret;
+			}
 		}
 
+		if (i->type == BCH_JSET_ENTRY_btree_root) {
+			guard(mutex)(&c->btree_root_lock);
+
+			struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
+
+			bkey_copy(&r->key, i->start);
+			r->level = i->level;
+			r->alive = true;
+		}
+	}
+
 	for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
 	     i != btree_trans_subbuf_top(trans, &trans->accounting);
 	     i = bkey_next(i)) {
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index b9e92bd55e1c..1adf6792ef97 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -270,6 +270,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
+	/* We don't yet do btree key updates correctly for when we're RW */
+	BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
 	bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
 		ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
 		if (ret)
@@ -277,12 +280,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
 	}
 
 	if (do_update) {
-		if (flags & BTREE_TRIGGER_is_root) {
-			bch_err(c, "cannot update btree roots yet");
-			ret = -EINVAL;
-			goto err;
-		}
-
 		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
 		ret = PTR_ERR_OR_ZERO(new);
 		if (ret)
@@ -370,19 +367,41 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
 			bch_info(c, "new key %s", buf.buf);
 		}
 
-		struct btree_iter iter;
-		bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
-					  BTREE_ITER_intent|BTREE_ITER_all_snapshots);
-		ret =   bch2_btree_iter_traverse(trans, &iter) ?:
-			bch2_trans_update(trans, &iter, new,
-					  BTREE_UPDATE_internal_snapshot_node|
-					  BTREE_TRIGGER_norun);
-		bch2_trans_iter_exit(trans, &iter);
-		if (ret)
-			goto err;
+		if (!(flags & BTREE_TRIGGER_is_root)) {
+			struct btree_iter iter;
+			bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+						  BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+			ret =   bch2_btree_iter_traverse(trans, &iter) ?:
+				bch2_trans_update(trans, &iter, new,
+						  BTREE_UPDATE_internal_snapshot_node|
+						  BTREE_TRIGGER_norun);
+			bch2_trans_iter_exit(trans, &iter);
+			if (ret)
+				goto err;
 
-		if (level)
-			bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+			if (level)
+				bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+		} else {
+			struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
+					       jset_u64s(new->k.u64s));
+			ret = PTR_ERR_OR_ZERO(e);
+			if (ret)
+				goto err;
+
+			journal_entry_set(e,
+					  BCH_JSET_ENTRY_btree_root,
+					  btree, level - 1,
+					  new, new->k.u64s);
+
+			/*
+			 * no locking, we're single threaded and not rw yet, see
+			 * the big assertino above that we repeat here:
+			 */
+			BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
+			struct btree *b = bch2_btree_id_root(c, btree)->b;
+			bkey_copy(&b->key, new);
+		}
 	}
 err:
 	printbuf_exit(&buf);

From f1dc067bc10ac95550869dbe25dc4db1420f4f6a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 01:00:34 -0400
Subject: [PATCH 28/69] bcachefs: sysfs/errors

Make the superblock error counters available in sysfs; the only other
way they can be seen is 'show-super', but we don't write the superblock
every time the error count gets incremented.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/sb-errors.c | 22 ++++++++++++++++++++++
 fs/bcachefs/sb-errors.h |  1 +
 fs/bcachefs/sysfs.c     |  6 ++++++
 3 files changed, 29 insertions(+)

diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 013a96883b4e..48853efdc105 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = {
 	.to_text	= bch2_sb_errors_to_text,
 };
 
+void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	if (out->nr_tabstops < 1)
+		printbuf_tabstop_push(out, 48);
+	if (out->nr_tabstops < 2)
+		printbuf_tabstop_push(out, 8);
+	if (out->nr_tabstops < 3)
+		printbuf_tabstop_push(out, 16);
+
+	guard(mutex)(&c->fsck_error_counts_lock);
+
+	bch_sb_errors_cpu *e = &c->fsck_error_counts;
+	darray_for_each(*e, i) {
+		bch2_sb_error_id_to_text(out, i->id);
+		prt_tab(out);
+		prt_u64(out, i->nr);
+		prt_tab(out);
+		bch2_prt_datetime(out, i->last_error_time);
+		prt_newline(out);
+	}
+}
+
 void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
 {
 	bch_sb_errors_cpu *e = &c->fsck_error_counts;
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index b2357b8e6107..e86267264692 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -7,6 +7,7 @@
 extern const char * const bch2_sb_error_strs[];
 
 void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
 
 extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
 
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1a55196d69f1..f93af1c45ae6 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -37,6 +37,7 @@
 #include "rebalance.h"
 #include "recovery_passes.h"
 #include "replicas.h"
+#include "sb-errors.h"
 #include "super-io.h"
 #include "tests.h"
 
@@ -172,6 +173,7 @@ read_attribute(btree_write_stats);
 
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
+read_attribute(errors);
 read_attribute(journal_debug);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
@@ -353,6 +355,9 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_compression_stats)
 		bch2_compression_stats_to_text(out, c);
 
+	if (attr == &sysfs_errors)
+		bch2_fs_errors_to_text(out, c);
+
 	if (attr == &sysfs_new_stripes)
 		bch2_new_stripes_to_text(out, c);
 
@@ -483,6 +488,7 @@ struct attribute *bch2_fs_files[] = {
 	&sysfs_recovery_status,
 
 	&sysfs_compression_stats,
+	&sysfs_errors,
 
 #ifdef CONFIG_BCACHEFS_TESTS
 	&sysfs_perf_test,

From 66621f016d792157fc661fe2cce40ab128009d79 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 15:02:37 -0400
Subject: [PATCH 29/69] bcachefs: Add missing printbuf_reset() in
 bch2_check_dirent_inode_dirent()

We were accidentally including the contents from the previous
fsck_err().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/namei.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index ff7ba297d3ef..c57da4029d36 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -768,6 +768,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
 			ret = __bch2_fsck_write_inode(trans, target);
 		}
 	} else {
+		printbuf_reset(&buf);
 		bch2_bkey_val_to_text(&buf, c, d.s_c);
 		prt_newline(&buf);
 		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);

From dc43f6a70b9685acabfda210f7cbb0520e952510 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 11:27:59 -0400
Subject: [PATCH 30/69] bcachefs: Mark bch_errcode helpers
 __attribute__((const))

These don't access global memory or defer pointer arguments - this
enables CSE optimizations.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/errcode.c | 4 +++-
 fs/bcachefs/errcode.h | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index 43557bebd0f8..c39cf304c681 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = {
 	NULL
 };
 
-static unsigned bch2_errcode_parents[] = {
+static const unsigned bch2_errcode_parents[] = {
 #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
 	BCH_ERRCODES()
 #undef x
 };
 
+__attribute__((const))
 const char *bch2_err_str(int err)
 {
 	const char *errstr;
@@ -36,6 +37,7 @@ const char *bch2_err_str(int err)
 	return errstr ?: "(Invalid error)";
 }
 
+__attribute__((const))
 bool __bch2_err_matches(int err, int class)
 {
 	err	= abs(err);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 62843e772b2c..6b0791e1e64d 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -357,9 +357,11 @@ enum bch_errcode {
 	BCH_ERR_MAX
 };
 
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
+__attribute__((const)) const char *bch2_err_str(int);
 
+__attribute__((const)) bool __bch2_err_matches(int, int);
+
+__attribute__((const))
 static inline bool _bch2_err_matches(int err, int class)
 {
 	return err < 0 && __bch2_err_matches(err, class);

From 642c1aabb001ee8410e6bfb6654b23bcead64e4e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 11:31:51 -0400
Subject: [PATCH 31/69] bcachefs: Use bch2_err_matches() for
 BCH_ERR_fsck_(fix|ignore)

We'll be adding subtypes of these errors, and new error code tracing.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c |  4 ++--
 fs/bcachefs/btree_io.c         | 10 ++++-----
 fs/bcachefs/error.c            | 41 ++++++++++++++++++++--------------
 fs/bcachefs/error.h            | 10 ++++-----
 4 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 271d4a2eb15f..f284b4a2b535 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -709,8 +709,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
 				  set ? "" : "un",
 				  bch2_btree_id_str(btree),
 				  buf.buf);
-	if (ret == -BCH_ERR_fsck_ignore ||
-	    ret == -BCH_ERR_fsck_errors_not_fixed)
+	if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
+	    bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
 		ret = 0;
 
 	printbuf_exit(&buf);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34018296053a..c19a4b44162a 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -602,8 +602,8 @@ static int __btree_err(int ret,
 		switch (ret) {
 		case -BCH_ERR_btree_node_read_err_fixable:
 			ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
-			if (ret2 != -BCH_ERR_fsck_fix &&
-			    ret2 != -BCH_ERR_fsck_ignore) {
+			if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+			    !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
 				ret = ret2;
 				goto fsck_err;
 			}
@@ -631,8 +631,8 @@ static int __btree_err(int ret,
 	switch (ret) {
 	case -BCH_ERR_btree_node_read_err_fixable:
 		ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
-		if (ret2 != -BCH_ERR_fsck_fix &&
-		    ret2 != -BCH_ERR_fsck_ignore) {
+		if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+		    !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
 			ret = ret2;
 			goto fsck_err;
 		}
@@ -660,7 +660,7 @@ static int __btree_err(int ret,
 			       failed, err_msg,				\
 			       msg, ##__VA_ARGS__);			\
 									\
-	if (_ret != -BCH_ERR_fsck_fix) {				\
+	if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) {		\
 		ret = _ret;						\
 		goto fsck_err;						\
 	}								\
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index c2cad28635bf..0488f966e1e4 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -444,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c,
 {
 	va_list args;
 	struct printbuf buf = PRINTBUF, *out = &buf;
-	int ret = -BCH_ERR_fsck_ignore;
+	int ret = 0;
 	const char *action_orig = "fix?", *action = action_orig;
 
 	might_sleep();
@@ -573,19 +573,26 @@ int __bch2_fsck_err(struct bch_fs *c,
 		} else {
 			prt_str(out, ", not ");
 			prt_actioning(out, action);
+			ret = -BCH_ERR_fsck_ignore;
+		}
+	} else {
+		if (flags & FSCK_CAN_IGNORE) {
+			prt_str(out, ", continuing");
+			ret = -BCH_ERR_fsck_ignore;
+		} else {
+			prt_str(out, " (repair unimplemented)");
+			ret = -BCH_ERR_fsck_repair_unimplemented;
 		}
-	} else if (!(flags & FSCK_CAN_IGNORE)) {
-		prt_str(out, " (repair unimplemented)");
 	}
 
-	if (ret == -BCH_ERR_fsck_ignore &&
+	if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
 	    (c->opts.fix_errors == FSCK_FIX_exit ||
 	     !(flags & FSCK_CAN_IGNORE)))
 		ret = -BCH_ERR_fsck_errors_not_fixed;
 
 	if (test_bit(BCH_FS_in_fsck, &c->flags) &&
-	    (ret != -BCH_ERR_fsck_fix &&
-	     ret != -BCH_ERR_fsck_ignore)) {
+	    (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
+	     !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
 		exiting = true;
 		print = true;
 	}
@@ -613,26 +620,26 @@ int __bch2_fsck_err(struct bch_fs *c,
 
 	if (s)
 		s->ret = ret;
-
+err_unlock:
+	mutex_unlock(&c->fsck_error_msgs_lock);
+err:
 	/*
 	 * We don't yet track whether the filesystem currently has errors, for
 	 * log_fsck_err()s: that would require us to track for every error type
 	 * which recovery pass corrects it, to get the fsck exit status correct:
 	 */
-	if (flags & FSCK_CAN_FIX) {
-		if (ret == -BCH_ERR_fsck_fix) {
-			set_bit(BCH_FS_errors_fixed, &c->flags);
-		} else {
-			set_bit(BCH_FS_errors_not_fixed, &c->flags);
-			set_bit(BCH_FS_error, &c->flags);
-		}
+	if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
+		set_bit(BCH_FS_errors_fixed, &c->flags);
+	} else {
+		set_bit(BCH_FS_errors_not_fixed, &c->flags);
+		set_bit(BCH_FS_error, &c->flags);
 	}
-err_unlock:
-	mutex_unlock(&c->fsck_error_msgs_lock);
-err:
+
 	if (action != action_orig)
 		kfree(action);
 	printbuf_exit(&buf);
+
+	BUG_ON(!ret);
 	return ret;
 }
 
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 5123d4c86770..4babb0d12bb2 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -105,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *);
 #define fsck_err_wrap(_do)						\
 ({									\
 	int _ret = _do;							\
-	if (_ret != -BCH_ERR_fsck_fix &&				\
-	    _ret != -BCH_ERR_fsck_ignore) {				\
+	if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) &&		\
+	    !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) {		\
 		ret = _ret;						\
 		goto fsck_err;						\
 	}								\
 									\
-	_ret == -BCH_ERR_fsck_fix;					\
+	bch2_err_matches(_ret, BCH_ERR_fsck_fix);			\
 })
 
 #define __fsck_err(...)		fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
@@ -170,8 +170,8 @@ do {									\
 	int _ret = __bch2_bkey_fsck_err(c, k, from,			\
 				BCH_FSCK_ERR_##_err_type,		\
 				_err_msg, ##__VA_ARGS__);		\
-	if (_ret != -BCH_ERR_fsck_fix &&				\
-	    _ret != -BCH_ERR_fsck_ignore)				\
+	if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) &&		\
+	    !bch2_err_matches(_ret, BCH_ERR_fsck_ignore))		\
 		ret = _ret;						\
 	ret = -BCH_ERR_fsck_delete_bkey;				\
 	goto fsck_err;							\

From f02d15327455822ed80e0b7d70b2ab3568a0389e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 16:06:07 -0400
Subject: [PATCH 32/69] bcachefs: Don't unlock trans before data_update_init()

data_update_init() does need to do btree operations, delay doing the
unlock-before-io.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c |  2 ++
 fs/bcachefs/move.c        | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 1216729c20e8..63a10ea83c97 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -980,6 +980,8 @@ int bch2_data_update_init(struct btree_trans *trans,
 		goto out_nocow_unlock;
 	}
 
+	bch2_trans_unlock(trans);
+
 	ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
 	if (ret)
 		goto out_nocow_unlock;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 857519cdac4e..fc1a7a04cb15 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -359,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt,
 		return 0;
 	}
 
-	/*
-	 * Before memory allocations & taking nocow locks in
-	 * bch2_data_update_init():
-	 */
-	bch2_trans_unlock(trans);
-
-	struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
+	struct moving_io *io = allocate_dropping_locks(trans, ret,
+				kzalloc(sizeof(struct moving_io), _gfp));
 	if (!io)
 		goto err;
 
+	if (ret)
+		goto err_free;
+
 	INIT_LIST_HEAD(&io->io_list);
 	io->write.ctxt		= ctxt;
 	io->read_sectors	= k.k->size;
@@ -388,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt,
 		io->write.op.c		= c;
 		io->write.data_opts	= data_opts;
 
+		bch2_trans_unlock(trans);
+
 		ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
 		if (ret)
 			goto err_free;

From 0224d17d762ce036fde5ad18dd33236db6fca88b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 27 May 2025 22:20:27 -0400
Subject: [PATCH 33/69] bcachefs: Runtime self healing for keys for deleted
 snapshots

If snapshot deletion incorrectly missing some keys and leaves keys for
deleted snapshots, that causes a bit of a problem for data move - we
can't move an extent for a nonexistent snapshot, because the extent
might have to be fragmented, and maintaining correct visibility in child
snapshots doesn't work if it doesn't have a snapshot.

Previously we'd just skip these keys, but it turns out that causes
copygc to spin.

So we need runtime self healing, i.e. calling check_key_has_snapshot()
from the data move path.

Snapshot deletion v2 included sentinal values for deleted snapshot
nodes, so this is quite safe.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 63a10ea83c97..fafe7a57ea41 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -821,13 +821,24 @@ int bch2_data_update_init(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	int ret = 0;
 
-	/*
-	 * fs is corrupt  we have a key for a snapshot node that doesn't exist,
-	 * and we have to check for this because we go rw before repairing the
-	 * snapshots table - just skip it, we can move it later.
-	 */
-	if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
-		return -BCH_ERR_data_update_done_no_snapshot;
+	if (k.k->p.snapshot) {
+		/*
+		 * We'll go ERO if we see a key for a missing snapshot, and if
+		 * we're still in recovery we want to give that a chance to
+		 * repair:
+		 */
+		if (unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
+			     bch2_snapshot_id_state(c, k.k->p.snapshot) == SNAPSHOT_ID_empty))
+			return -BCH_ERR_data_update_done_no_snapshot;
+
+		ret = bch2_check_key_has_snapshot(trans, iter, k);
+		if (ret < 0)
+			return ret;
+		if (ret) /* key was deleted */
+			return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+				-BCH_ERR_data_update_done_no_snapshot;
+		ret = 0;
+	}
 
 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);

From d21262d4e35d448cbc80092c91f04cc0a5f2b0b4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 14:26:33 -0400
Subject: [PATCH 34/69] bcachefs: bch2_dev_journal_bucket_delete()

Recover from "journal and btree in same bucket".

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/buckets.c  | 14 ++++++++++
 fs/bcachefs/journal.c  | 60 ++++++++++++++++++++++++++++++++++++++++++
 fs/bcachefs/journal.h  |  5 ++--
 fs/bcachefs/str_hash.c |  8 +++---
 4 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 1adf6792ef97..410e0116917f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -221,6 +221,20 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		if (!p.ptr.cached &&
 		    data_type == BCH_DATA_btree) {
+			switch (g->data_type) {
+			case BCH_DATA_sb:
+				bch_err(c, "btree and superblock in the same bucket - cannot repair");
+				ret = -BCH_ERR_fsck_repair_unimplemented;
+				goto out;
+			case BCH_DATA_journal:
+				ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
+				bch_err_msg(c, ret, "error deleting journal bucket %zu",
+					    PTR_BUCKET_NR(ca, &p.ptr));
+				if (ret)
+					goto out;
+				break;
+			}
+
 			g->data_type		= data_type;
 			g->stripe_sectors	= 0;
 			g->dirty_sectors	= 0;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 09b70fd140a1..fd7f9ff33da0 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1304,6 +1304,66 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 	return ret;
 }
 
+int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
+{
+	struct bch_fs *c = ca->fs;
+	struct journal *j = &c->journal;
+	struct journal_device *ja = &ca->journal;
+
+	guard(mutex)(&c->sb_lock);
+	unsigned pos;
+	for (pos = 0; pos < ja->nr; pos++)
+		if (ja->buckets[pos] == b)
+			break;
+
+	if (pos == ja->nr) {
+		bch_err(ca, "journal bucket %llu not found when deleting", b);
+		return -EINVAL;
+	}
+
+	u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);;
+	if (!new_buckets)
+		return -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+
+	memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+	memmove(&new_buckets[pos],
+		&new_buckets[pos + 1],
+		(ja->nr - 1 - pos) * sizeof(new_buckets[0]));
+
+	int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?:
+		bch2_write_super(c);
+	if (ret) {
+		kfree(new_buckets);
+		return ret;
+	}
+
+	scoped_guard(spinlock, &j->lock) {
+		if (pos < ja->discard_idx)
+			--ja->discard_idx;
+		if (pos < ja->dirty_idx_ondisk)
+			--ja->dirty_idx_ondisk;
+		if (pos < ja->dirty_idx)
+			--ja->dirty_idx;
+		if (pos < ja->cur_idx)
+			--ja->cur_idx;
+
+		ja->nr--;
+
+		memmove(&ja->buckets[pos],
+			&ja->buckets[pos + 1],
+			(ja->nr - pos) * sizeof(ja->buckets[0]));
+
+		memmove(&ja->bucket_seq[pos],
+			&ja->bucket_seq[pos + 1],
+			(ja->nr - pos) * sizeof(ja->bucket_seq[0]));
+
+		bch2_journal_space_available(j);
+	}
+
+	kfree(new_buckets);
+	return 0;
+}
+
 int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 {
 	struct bch_fs *c = ca->fs;
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 8ff00a0ec778..83734fe4331f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -444,8 +444,9 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
-				unsigned nr);
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
+
 int bch2_dev_journal_alloc(struct bch_dev *, bool);
 int bch2_fs_journal_alloc(struct bch_fs *);
 
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 0cbf5508a32c..bfd4346a4d93 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -239,7 +239,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 
 	for_each_btree_key_norestart(trans, iter, desc->btree_id,
 				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
-				     BTREE_ITER_slots, k, ret) {
+				     BTREE_ITER_slots|
+				     BTREE_ITER_with_updates, k, ret) {
 		if (bkey_eq(k.k->p, hash_k.k->p))
 			break;
 
@@ -286,10 +287,11 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 			goto duplicate_entries;
 
 		ret =   bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+					    BTREE_ITER_with_updates|
 					    BTREE_UPDATE_internal_snapshot_node) ?:
 			bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-			-BCH_ERR_transaction_restart_nested;
+			-BCH_ERR_transaction_restart_commit;
 		goto out;
 	}
 fsck_err:
@@ -323,6 +325,6 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 	}
 
 	ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
-		-BCH_ERR_transaction_restart_nested;
+		-BCH_ERR_transaction_restart_commit;
 	goto out;
 }

From 801cb2bd6cb7ef2f4568b6646f92234c225c0932 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 15:08:19 -0400
Subject: [PATCH 35/69] bcachefs: bch2_get_snapshot_overwrites()

New helper for getting a list of snapshot IDs that have overwritten a
given key.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update.c | 13 ++++++++++++-
 fs/bcachefs/snapshot.c     | 29 +++++++++++++++++++++++++++++
 fs/bcachefs/snapshot.h     | 19 +++++++++++++++++++
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 5dac09c98026..48af3d2e3da9 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -180,8 +180,19 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
 	}
 	bch2_trans_iter_exit(trans, &new_iter);
 	bch2_trans_iter_exit(trans, &old_iter);
-	darray_exit(&s);
 
+	snapshot_id_list s2;
+	ret = bch2_get_snapshot_overwrites(trans, id, old_pos, &s2);
+	if (ret) {
+		darray_exit(&s);
+		return ret;
+	}
+
+	BUG_ON(s.nr != s2.nr);
+	BUG_ON(memcmp(s.data, s2.data, sizeof(s.data[0]) * s.nr));
+
+	darray_exit(&s2);
+	darray_exit(&s);
 	return ret;
 }
 
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 00d62d1190ef..f553fe095f61 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1079,6 +1079,35 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans,
 	return ret;
 }
 
+int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
+				   enum btree_id btree, struct bpos pos,
+				   snapshot_id_list *s)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
+					     BTREE_ITER_all_snapshots, k, ret) {
+		if (!bkey_eq(k.k->p, pos))
+			break;
+
+		if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
+		    snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
+			continue;
+
+		ret = snapshot_list_add(c, s, k.k->p.snapshot);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+	if (ret)
+		darray_exit(s);
+
+	return ret;
+}
+
 /*
  * Mark a snapshot as deleted, for future cleanup:
  */
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 382a171f5413..be7b71c06621 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -258,6 +258,25 @@ static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
 		: __bch2_check_key_has_snapshot(trans, iter, k);
 }
 
+int __bch2_get_snapshot_overwrites(struct btree_trans *,
+				   enum btree_id, struct bpos,
+				   snapshot_id_list *);
+
+/*
+ * Get a list of snapshot IDs that have overwritten a given key:
+ */
+static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
+					       enum btree_id btree, struct bpos pos,
+					       snapshot_id_list *s)
+{
+	darray_init(s);
+
+	return bch2_snapshot_has_children(trans->c, pos.snapshot)
+		? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
+		: 0;
+
+}
+
 int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
 
 int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);

From cb6f5d0decea51225b297b0ad3c393de12d68bf0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 15:20:20 -0400
Subject: [PATCH 36/69] bcachefs: __bch2_insert_snapshot_whiteouts()
 refactoring

Now uses bch2_get_snapshot_overwrites(), and much shorter.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update.c | 68 ++++++++++----------------------------
 fs/bcachefs/btree_update.h | 14 ++++++--
 2 files changed, 30 insertions(+), 52 deletions(-)

diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 48af3d2e3da9..e04508da5f7b 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -123,76 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
 }
 
 int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-				   enum btree_id id,
-				   struct bpos old_pos,
-				   struct bpos new_pos)
+				     enum btree_id btree, struct bpos pos,
+				     snapshot_id_list *s)
 {
-	struct bch_fs *c = trans->c;
-	struct btree_iter old_iter, new_iter = {};
-	struct bkey_s_c old_k, new_k;
-	snapshot_id_list s;
-	struct bkey_i *update;
 	int ret = 0;
 
-	if (!bch2_snapshot_has_children(c, old_pos.snapshot))
-		return 0;
+	darray_for_each(*s, id) {
+		pos.snapshot = *id;
 
-	darray_init(&s);
-
-	bch2_trans_iter_init(trans, &old_iter, id, old_pos,
-			     BTREE_ITER_not_extents|
-			     BTREE_ITER_all_snapshots);
-	while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
-	       !(ret = bkey_err(old_k)) &&
-	       bkey_eq(old_pos, old_k.k->p)) {
-		struct bpos whiteout_pos =
-			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
-		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
-		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
-			continue;
-
-		new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
-					   BTREE_ITER_not_extents|
-					   BTREE_ITER_intent);
-		ret = bkey_err(new_k);
+		struct btree_iter iter;
+		struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
+						       BTREE_ITER_not_extents|
+						       BTREE_ITER_intent);
+		ret = bkey_err(k);
 		if (ret)
 			break;
 
-		if (new_k.k->type == KEY_TYPE_deleted) {
-			update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+		if (k.k->type == KEY_TYPE_deleted) {
+			struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
 			ret = PTR_ERR_OR_ZERO(update);
-			if (ret)
+			if (ret) {
+				bch2_trans_iter_exit(trans, &iter);
 				break;
+			}
 
 			bkey_init(&update->k);
-			update->k.p		= whiteout_pos;
+			update->k.p		= pos;
 			update->k.type		= KEY_TYPE_whiteout;
 
-			ret = bch2_trans_update(trans, &new_iter, update,
+			ret = bch2_trans_update(trans, &iter, update,
 						BTREE_UPDATE_internal_snapshot_node);
 		}
-		bch2_trans_iter_exit(trans, &new_iter);
+		bch2_trans_iter_exit(trans, &iter);
 
-		ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
 		if (ret)
 			break;
 	}
-	bch2_trans_iter_exit(trans, &new_iter);
-	bch2_trans_iter_exit(trans, &old_iter);
 
-	snapshot_id_list s2;
-	ret = bch2_get_snapshot_overwrites(trans, id, old_pos, &s2);
-	if (ret) {
-		darray_exit(&s);
-		return ret;
-	}
-
-	BUG_ON(s.nr != s2.nr);
-	BUG_ON(memcmp(s.data, s2.data, sizeof(s.data[0]) * s.nr));
-
-	darray_exit(&s2);
-	darray_exit(&s);
+	darray_exit(s);
 	return ret;
 }
 
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index f907eaa8b185..9feef1dc4de5 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -4,6 +4,7 @@
 
 #include "btree_iter.h"
 #include "journal.h"
+#include "snapshot.h"
 
 struct bch_fs;
 struct btree;
@@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
 }
 
 int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
-				     struct bpos, struct bpos);
+				     struct bpos, snapshot_id_list *);
 
 /*
  * For use when splitting extents in existing snapshots:
@@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
 						 struct bpos old_pos,
 						 struct bpos new_pos)
 {
+	BUG_ON(old_pos.snapshot != new_pos.snapshot);
+
 	if (!btree_type_has_snapshots(btree) ||
 	    bkey_eq(old_pos, new_pos))
 		return 0;
 
-	return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+	snapshot_id_list s;
+	int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
+	if (ret)
+		return ret;
+
+	return s.nr
+		? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
+		: 0;
 }
 
 int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,

From a5922682602788a0f9b37d58e15d7247ad6c54d4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 16:25:11 -0400
Subject: [PATCH 37/69] bcachefs: bch2_str_hash_check_key() may now be called
 without snapshots_seen

We don't track snapshot overwrites outside of fsck, so for this to be
called at runtime outside of fsck we need to create it on demand, when
we have repair to do.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c     |  5 -----
 fs/bcachefs/fsck.h     |  6 ++++++
 fs/bcachefs/str_hash.c | 23 ++++++++++++++++++++++-
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 8e07a365b24c..950fa9685d3e 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -643,11 +643,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
 	return __bch2_fsck_write_inode(trans, &new_inode);
 }
 
-struct snapshots_seen {
-	struct bpos			pos;
-	snapshot_id_list		ids;
-};
-
 static inline void snapshots_seen_exit(struct snapshots_seen *s)
 {
 	darray_exit(&s->ids);
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 574948278cd4..e5fe7cf7b251 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -4,6 +4,12 @@
 
 #include "str_hash.h"
 
+/* recoverds snapshot IDs of overwrites at @pos */
+struct snapshots_seen {
+	struct bpos			pos;
+	snapshot_id_list		ids;
+};
+
 int bch2_fsck_update_backpointers(struct btree_trans *,
 				  struct snapshots_seen *,
 				  const struct bch_hash_desc,
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index bfd4346a4d93..f101ca8581d9 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -231,6 +231,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 	struct btree_iter iter = {};
 	struct printbuf buf = PRINTBUF;
 	struct bkey_s_c k;
+	bool free_snapshots_seen = false;
 	int ret = 0;
 
 	u64 hash = desc->hash_bkey(hash_info, hash_k);
@@ -256,6 +257,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 out:
 	bch2_trans_iter_exit(trans, &iter);
 	printbuf_exit(&buf);
+	if (free_snapshots_seen)
+		darray_exit(&s->ids);
 	return ret;
 bad_hash:
 	/*
@@ -265,6 +268,22 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
+	if (!s) {
+		s = bch2_trans_kmalloc(trans, sizeof(*s));
+		ret = PTR_ERR_OR_ZERO(s);
+		if (ret)
+			goto out;
+
+		s->pos = k_iter->pos;
+		darray_init(&s->ids);
+
+		ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
+		if (ret)
+			goto out;
+
+		free_snapshots_seen = true;
+	}
+
 	if (fsck_err(trans, hash_table_key_wrong_offset,
 		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
 		     bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
@@ -286,7 +305,9 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 		if (k.k)
 			goto duplicate_entries;
 
-		ret =   bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+		ret =   bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
+						       k_iter->pos, new->k.p) ?:
+			bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
 					    BTREE_ITER_with_updates|
 					    BTREE_UPDATE_internal_snapshot_node) ?:
 			bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:

From f402d9710b3e55fa2e47a939f69e4267d6d4406f Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 16:34:42 -0400
Subject: [PATCH 38/69] bcachefs: bch2_readdir() now calls str_hash_check_key()

More self healing code: readdir will now notice if there are dirents
hashed incorrectly, and it'll repair them if errors=fix_safe.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c | 9 +++++++--
 fs/bcachefs/dirent.h | 2 +-
 fs/bcachefs/fs.c     | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d198001838f3..37d7cf69ae1d 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -692,7 +692,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
 	return !ret;
 }
 
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum,
+		 struct bch_hash_info *hash_info,
+		 struct dir_context *ctx)
 {
 	struct bkey_buf sk;
 	bch2_bkey_buf_init(&sk);
@@ -710,7 +712,10 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 			struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
 
 			subvol_inum target;
-			int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+
+			int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
+							   hash_info, &iter, k) ?:
+				bch2_dirent_read_target(trans, inum, dirent, &target);
 			if (ret2 > 0)
 				continue;
 
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index d3e7ae669575..1f600dedafe1 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -95,7 +95,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
 
 int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
 int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
 
 int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
 
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index adae43223bce..f52c7db16dec 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1573,11 +1573,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
-	int ret = bch2_readdir(c, inode_inum(inode), ctx);
+	int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
 
 	bch_err_fn(c, ret);
 	return bch2_err_class(ret);

From 6447544c3d1473c9d8945e2cc0f3c71eba4c354b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 17:32:35 -0400
Subject: [PATCH 39/69] bcachefs: Improve error printing in
 btree_node_check_topology()

We had a bug report where the errors from btree_node_check_topology()
don't seem to be getting printed; log_fsck_err() does some fancy
ratelimiting-type stuff that we don't want here.

Instead, just use bch2_count_fsck_err(); this is simpler, and modelled
after how we're currently handling bucket ref update errors in
buckets.c.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update_interior.c | 71 ++++++++++++++---------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 74e65714fecd..647b40efd27f 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -57,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
 	struct bkey_buf prev;
 	int ret = 0;
 
-	printbuf_indent_add_nextline(&buf, 2);
-
 	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
 	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
 			b->data->min_key));
@@ -69,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
 
 	if (b == btree_node_root(c, b)) {
 		if (!bpos_eq(b->data->min_key, POS_MIN)) {
-			ret = __bch2_topology_error(c, &buf);
-
+			bch2_log_msg_start(c, &buf);
+			prt_printf(&buf, "btree root with incorrect min_key: ");
 			bch2_bpos_to_text(&buf, b->data->min_key);
-			log_fsck_err(trans, btree_root_bad_min_key,
-				      "btree root with incorrect min_key: %s", buf.buf);
-			goto out;
+			prt_newline(&buf);
+
+			bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
+			goto err;
 		}
 
 		if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
-			ret = __bch2_topology_error(c, &buf);
+			bch2_log_msg_start(c, &buf);
+			prt_printf(&buf, "btree root with incorrect max_key: ");
 			bch2_bpos_to_text(&buf, b->data->max_key);
-			log_fsck_err(trans, btree_root_bad_max_key,
-				      "btree root with incorrect max_key: %s", buf.buf);
-			goto out;
+			prt_newline(&buf);
+
+			bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
+			goto err;
 		}
 	}
 
@@ -100,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
 			: bpos_successor(prev.k->k.p);
 
 		if (!bpos_eq(expected_min, bp.v->min_key)) {
-			ret = __bch2_topology_error(c, &buf);
-
-			prt_str(&buf, "end of prev node doesn't match start of next node\nin ");
-			bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-			prt_str(&buf, " node ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+			prt_str(&buf, "end of prev node doesn't match start of next node");
 			prt_str(&buf, "\nprev ");
 			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
 			prt_str(&buf, "\nnext ");
 			bch2_bkey_val_to_text(&buf, c, k);
+			prt_newline(&buf);
 
-			log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
-			goto out;
+			bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
+			goto err;
 		}
 
 		bch2_bkey_buf_reassemble(&prev, c, k);
@@ -120,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
 	}
 
 	if (bkey_deleted(&prev.k->k)) {
-		ret = __bch2_topology_error(c, &buf);
+		prt_printf(&buf, "empty interior node\n");
+		bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
+		goto err;
+	}
 
-		prt_str(&buf, "empty interior node\nin ");
-		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-		prt_str(&buf, " node ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
-		log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
-	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
-		ret = __bch2_topology_error(c, &buf);
-
-		prt_str(&buf, "last child node doesn't end at end of parent node\nin ");
-		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-		prt_str(&buf, " node ");
-		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		prt_str(&buf, "\nlast key ");
+	if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+		prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+		prt_newline(&buf);
 
-		log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+		bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
+		goto err;
 	}
 out:
-fsck_err:
 	bch2_btree_and_journal_iter_exit(&iter);
 	bch2_bkey_buf_exit(&prev, c);
 	printbuf_exit(&buf);
 	return ret;
+err:
+	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+	prt_char(&buf, ' ');
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+	prt_newline(&buf);
+
+	ret = __bch2_topology_error(c, &buf);
+	bch2_print_str(c, KERN_ERR, buf.buf);
+	BUG_ON(!ret);
+	goto out;
 }
 
 /* Calculate ideal packed bkey format for new btree nodes: */

From 9a1accd3a57d4bfb6daeee2262b1b24b57ec2382 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 20:06:01 -0400
Subject: [PATCH 40/69] bcachefs: Journal keys are retained until shutdown, or
 journal replay finishes

If we don't finish journal replay we need to keep journal keys around
until the filesystem shuts down - otherwise e.g. -o norecovery, various
tools (dump, list) break, and eventually we'll be doing journal replay
in the background.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/recovery.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 4fca57575565..4b51105bdb2e 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1093,10 +1093,6 @@ int bch2_fs_recovery(struct bch_fs *c)
 out:
 	bch2_flush_fsck_errs(c);
 
-	if (!c->opts.retain_recovery_info) {
-		bch2_journal_keys_put_initial(c);
-		bch2_find_btree_nodes_exit(&c->found_btree_nodes);
-	}
 	if (!IS_ERR(clean))
 		kfree(clean);
 

From 5802caf74fa5647a0e560b585bf7d1ac65b20e11 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 16:56:50 -0400
Subject: [PATCH 41/69] bcachefs: darray_find(), darray_find_p()

New helpers to avoid open coded loops.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c | 24 +++++++++++-------------
 fs/bcachefs/darray.h           | 18 +++++++++++++++++-
 fs/bcachefs/fsck.c             | 11 ++++-------
 fs/bcachefs/snapshot.c         | 11 +++--------
 fs/bcachefs/snapshot.h         |  5 +----
 5 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index f284b4a2b535..2325a2699a89 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1792,11 +1792,12 @@ static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progres
 	int ret;
 
 	mutex_lock(&ca->discard_buckets_in_flight_lock);
-	darray_for_each(ca->discard_buckets_in_flight, i)
-		if (i->bucket == bucket) {
-			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
-			goto out;
-		}
+	struct discard_in_flight *i =
+		darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+	if (i) {
+		ret = -BCH_ERR_EEXIST_discard_in_flight_add;
+		goto out;
+	}
 
 	ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
 			   .in_progress = in_progress,
@@ -1810,14 +1811,11 @@ static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progres
 static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
 {
 	mutex_lock(&ca->discard_buckets_in_flight_lock);
-	darray_for_each(ca->discard_buckets_in_flight, i)
-		if (i->bucket == bucket) {
-			BUG_ON(!i->in_progress);
-			darray_remove_item(&ca->discard_buckets_in_flight, i);
-			goto found;
-		}
-	BUG();
-found:
+	struct discard_in_flight *i =
+		darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+	BUG_ON(!i || !i->in_progress);
+
+	darray_remove_item(&ca->discard_buckets_in_flight, i);
 	mutex_unlock(&ca->discard_buckets_in_flight_lock);
 }
 
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 50ec3decfe8c..d08d39c1b93d 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -87,7 +87,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
 #define darray_remove_item(_d, _pos)					\
 	array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
 
-#define __darray_for_each(_d, _i)						\
+#define darray_find_p(_d, _i, cond)					\
+({									\
+	typeof((_d).data) _ret = NULL;					\
+									\
+	darray_for_each(_d, _i)						\
+		if (cond) {						\
+			_ret = _i;					\
+			break;						\
+		}							\
+	_ret;								\
+})
+
+#define darray_find(_d, _item)	darray_find_p(_d, _i, *_i == _item)
+
+/* Iteration: */
+
+#define __darray_for_each(_d, _i)					\
 	for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
 #define darray_for_each(_d, _i)						\
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 950fa9685d3e..631ee2af8585 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -885,14 +885,11 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str
 {
 	struct bch_fs *c = trans->c;
 
-	struct inode_walker_entry *i;
-	__darray_for_each(w->inodes, i)
-		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
-			goto found;
+	struct inode_walker_entry *i = darray_find_p(w->inodes, i,
+		    bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
 
-	return NULL;
-found:
-	BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
+	if (!i)
+		return NULL;
 
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index f553fe095f61..cf9a65e858f6 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -947,10 +947,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo
 
 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
 {
-	darray_for_each(*l, i)
-		if (snapshot_list_has_id(r, *i))
-			return true;
-	return false;
+	return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
 }
 
 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
@@ -1428,10 +1425,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
 
 static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
 {
-	darray_for_each(*l, i)
-		if (i->id == id)
-			return i->live_child;
-	return 0;
+	struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
+	return i ? i->live_child : 0;
 }
 
 static unsigned __live_child(struct snapshot_table *t, u32 id,
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index be7b71c06621..ee79f81f175c 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -190,10 +190,7 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
 
 static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
 {
-	darray_for_each(*s, i)
-		if (*i == id)
-			return true;
-	return false;
+	return darray_find(*s, id) != NULL;
 }
 
 static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)

From 1f42a0335a721eca962f792794e864797d09087a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 18:02:21 -0400
Subject: [PATCH 42/69] bcachefs: sysfs trigger_emergency_read_only

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/sysfs.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index f93af1c45ae6..10f4045301a3 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -26,6 +26,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "enumerated_ref.h"
+#include "error.h"
 #include "inode.h"
 #include "journal.h"
 #include "journal_reclaim.h"
@@ -152,6 +153,7 @@ write_attribute(trigger_btree_updates);
 write_attribute(trigger_freelist_wakeup);
 write_attribute(trigger_recalc_capacity);
 write_attribute(trigger_delete_dead_snapshots);
+write_attribute(trigger_emergency_read_only);
 read_attribute(gc_gens_pos);
 
 read_attribute(uuid);
@@ -453,6 +455,16 @@ STORE(bch2_fs)
 	if (attr == &sysfs_trigger_delete_dead_snapshots)
 		__bch2_delete_dead_snapshots(c);
 
+	if (attr == &sysfs_trigger_emergency_read_only) {
+		struct printbuf buf = PRINTBUF;
+		bch2_log_msg_start(c, &buf);
+
+		prt_printf(&buf, "shutdown by sysfs\n");
+		bch2_fs_emergency_read_only2(c, &buf);
+		bch2_print_str(c, KERN_ERR, buf.buf);
+		printbuf_exit(&buf);
+	}
+
 #ifdef CONFIG_BCACHEFS_TESTS
 	if (attr == &sysfs_perf_test) {
 		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -585,6 +597,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_trigger_freelist_wakeup,
 	&sysfs_trigger_recalc_capacity,
 	&sysfs_trigger_delete_dead_snapshots,
+	&sysfs_trigger_emergency_read_only,
 
 	&sysfs_gc_gens_pos,
 

From a0f7437906d115c3fff1b6242f57ca87262a879b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 20:16:58 -0400
Subject: [PATCH 43/69] bcachefs: sysfs trigger_journal_commit

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/sysfs.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 10f4045301a3..05848375cea2 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -145,6 +145,7 @@ do {									\
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_commit);
 write_attribute(trigger_journal_flush);
 write_attribute(trigger_journal_writes);
 write_attribute(trigger_btree_cache_shrink);
@@ -435,6 +436,9 @@ STORE(bch2_fs)
 	if (attr == &sysfs_trigger_invalidates)
 		bch2_do_invalidates(c);
 
+	if (attr == &sysfs_trigger_journal_commit)
+		bch2_journal_flush(&c->journal);
+
 	if (attr == &sysfs_trigger_journal_flush) {
 		bch2_journal_flush_all_pins(&c->journal);
 		bch2_journal_meta(&c->journal);
@@ -589,6 +593,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_trigger_gc,
 	&sysfs_trigger_discards,
 	&sysfs_trigger_invalidates,
+	&sysfs_trigger_journal_commit,
 	&sysfs_trigger_journal_flush,
 	&sysfs_trigger_journal_writes,
 	&sysfs_trigger_btree_cache_shrink,

From 237a8e16bd71cce84e0e3404e1ed8df2b5d63c7c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 25 May 2025 02:59:35 -0400
Subject: [PATCH 44/69] bcachefs: CLASS(printbuf)

Add a DEFINE_CLASS() for printbufs.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/printbuf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 1ca476adbf6f..8f4e28d440ac 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
 	.size	= _size,				\
 })
 
+static inline struct printbuf bch2_printbuf_init(void)
+{
+	return PRINTBUF;
+}
+
+DEFINE_CLASS(printbuf, struct printbuf,
+	     bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
+
 /*
  * Returns size remaining of output buffer:
  */

From 42359f1615cf0c8184de67408ab294b574cfaaf6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 29 May 2025 19:13:42 -0400
Subject: [PATCH 45/69] bcachefs: CLASS(darray)

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/darray.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index d08d39c1b93d..4080ee99aadd 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -8,6 +8,7 @@
  * Inspired by CCAN's darray
  */
 
+#include <linux/cleanup.h>
 #include <linux/slab.h>
 
 #define DARRAY_PREALLOCATED(_type, _nr)					\
@@ -112,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
 #define darray_for_each_reverse(_d, _i)					\
 	for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
 
+/* Init/exit */
+
 #define darray_init(_d)							\
 do {									\
 	(_d)->nr = 0;							\
@@ -127,4 +130,29 @@ do {									\
 	darray_init(_d);						\
 } while (0)
 
+#define DEFINE_DARRAY_CLASS(_type)					\
+DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
+
+#define DEFINE_DARRAY(_type)						\
+typedef DARRAY(_type)	darray_##_type;					\
+DEFINE_DARRAY_CLASS(darray_##_type)
+
+#define DEFINE_DARRAY_NAMED(_name, _type)				\
+typedef DARRAY(_type)	_name;						\
+DEFINE_DARRAY_CLASS(_name)
+
+DEFINE_DARRAY_CLASS(darray_char);
+DEFINE_DARRAY_CLASS(darray_str)
+DEFINE_DARRAY_CLASS(darray_const_str)
+
+DEFINE_DARRAY_CLASS(darray_u8)
+DEFINE_DARRAY_CLASS(darray_u16)
+DEFINE_DARRAY_CLASS(darray_u32)
+DEFINE_DARRAY_CLASS(darray_u64)
+
+DEFINE_DARRAY_CLASS(darray_s8)
+DEFINE_DARRAY_CLASS(darray_s16)
+DEFINE_DARRAY_CLASS(darray_s32)
+DEFINE_DARRAY_CLASS(darray_s64)
+
 #endif /* _BCACHEFS_DARRAY_H */

From 9cb49fbf734609c79ba29c43d98e1230ecd1361a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 25 May 2025 01:41:17 -0400
Subject: [PATCH 46/69] bcachefs: CLASS(btree_trans)

Allow btree_trans to be used with CLASS().

Automatic cleanup, instead of manually calling bch2_trans_put().

We don't use DEFINE_CLASS because using a static inline for the
constructor breaks bch2_trans_get()'s use of __func__, so we have to
open code it.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.h | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 2cabb5f0f484..09dd3e52622e 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -963,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
 	_p;								\
 })
 
-#define bch2_trans_run(_c, _do)						\
-({									\
-	struct btree_trans *trans = bch2_trans_get(_c);			\
-	int _ret = (_do);						\
-	bch2_trans_put(trans);						\
-	_ret;								\
-})
-
-#define bch2_trans_do(_c, _do)	bch2_trans_run(_c, lockrestart_do(trans, _do))
-
 struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
 void bch2_trans_put(struct btree_trans *);
 
@@ -990,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *);
 	__bch2_trans_get(_c, trans_fn_idx);				\
 })
 
+/*
+ * We don't use DEFINE_CLASS() because using a function for the constructor
+ * breaks bch2_trans_get()'s use of __func__
+ */
+typedef struct btree_trans * class_btree_trans_t;
+static inline void class_btree_trans_destructor(struct btree_trans **p)
+{
+	struct btree_trans *trans = *p;
+	bch2_trans_put(trans);
+}
+
+#define class_btree_trans_constructor(_c)	bch2_trans_get(_c)
+
+#define bch2_trans_run(_c, _do)						\
+({									\
+	CLASS(btree_trans, trans)(_c);					\
+	(_do);								\
+})
+
+#define bch2_trans_do(_c, _do)	bch2_trans_run(_c, lockrestart_do(trans, _do))
+
 void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
 void bch2_fs_btree_iter_exit(struct bch_fs *);

From 18dad454cd16cbb4c219dbd19a0008af52eb294a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 24 May 2025 16:33:39 -0400
Subject: [PATCH 47/69] bcachefs: Replace rcu_read_lock() with guards

The new guard(), scoped_guard() allow for more natural code.

Some of the uses with creative flow control have been left.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c   | 23 ++++--------
 fs/bcachefs/alloc_background.h   |  6 ++--
 fs/bcachefs/alloc_foreground.c   | 46 ++++++++++--------------
 fs/bcachefs/backpointers.c       | 57 +++++++++++++++--------------
 fs/bcachefs/backpointers.h       |  3 +-
 fs/bcachefs/btree_gc.c           | 49 +++++++++++++------------
 fs/bcachefs/btree_io.c           | 13 ++++---
 fs/bcachefs/btree_iter.c         | 12 +++----
 fs/bcachefs/btree_journal_iter.c | 13 +++----
 fs/bcachefs/btree_key_cache.c    | 16 ++++-----
 fs/bcachefs/btree_locking.c      |  3 +-
 fs/bcachefs/buckets.c            | 18 ++++------
 fs/bcachefs/buckets.h            | 12 +++----
 fs/bcachefs/chardev.c            |  7 ++--
 fs/bcachefs/data_update.c        | 27 +++++++-------
 fs/bcachefs/debug.c              | 28 +++++++--------
 fs/bcachefs/disk_accounting.c    | 28 +++++++--------
 fs/bcachefs/disk_accounting.h    |  6 ++--
 fs/bcachefs/disk_groups.c        | 35 ++++++------------
 fs/bcachefs/ec.c                 | 47 ++++++++++++------------
 fs/bcachefs/extents.c            | 51 ++++++++++----------------
 fs/bcachefs/fs-io.c              | 12 +++----
 fs/bcachefs/fs.c                 | 19 +++++-----
 fs/bcachefs/io_read.c            |  3 +-
 fs/bcachefs/io_write.c           |  7 ++--
 fs/bcachefs/journal.c            |  7 ++--
 fs/bcachefs/journal_io.c         |  3 +-
 fs/bcachefs/journal_reclaim.c    | 21 ++++-------
 fs/bcachefs/lru.c                |  6 ++--
 fs/bcachefs/move.c               |  6 ++--
 fs/bcachefs/movinggc.c           | 26 +++++++-------
 fs/bcachefs/movinggc.h           |  3 +-
 fs/bcachefs/rebalance.c          | 17 +++++----
 fs/bcachefs/replicas.c           | 23 ++++++------
 fs/bcachefs/sb-members.c         | 19 ++++------
 fs/bcachefs/sb-members.h         | 32 ++++++-----------
 fs/bcachefs/six.c                |  7 ++--
 fs/bcachefs/snapshot.c           | 46 ++++++++----------------
 fs/bcachefs/snapshot.h           | 61 +++++++++-----------------------
 fs/bcachefs/subvolume.c          |  8 ++---
 fs/bcachefs/super.c              | 47 ++++++++++--------------
 41 files changed, 346 insertions(+), 527 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 2325a2699a89..e1231b20faec 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1000,14 +1000,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 		}
 
 		if (new_a->gen != old_a->gen) {
-			rcu_read_lock();
+			guard(rcu)();
 			u8 *gen = bucket_gen(ca, new.k->p.offset);
-			if (unlikely(!gen)) {
-				rcu_read_unlock();
+			if (unlikely(!gen))
 				goto invalid_bucket;
-			}
 			*gen = new_a->gen;
-			rcu_read_unlock();
 		}
 
 #define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
@@ -1033,15 +1030,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 	}
 
 	if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
-		rcu_read_lock();
+		guard(rcu)();
 		struct bucket *g = gc_bucket(ca, new.k->p.offset);
-		if (unlikely(!g)) {
-			rcu_read_unlock();
+		if (unlikely(!g))
 			goto invalid_bucket;
-		}
 		g->gen_valid	= 1;
 		g->gen		= new_a->gen;
-		rcu_read_unlock();
 	}
 err:
 fsck_err:
@@ -1117,13 +1111,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
 		bucket->offset = 0;
 	}
 
-	rcu_read_lock();
+	guard(rcu)();
 	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
 	if (*ca) {
 		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
 		bch2_dev_get(*ca);
 	}
-	rcu_read_unlock();
 
 	return *ca != NULL;
 }
@@ -2514,7 +2507,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
 	lockdep_assert_held(&c->state_lock);
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_member_device_rcu(c, ca, NULL) {
 		struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
 		if (bdev)
@@ -2559,7 +2552,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
 		bucket_size_max = max_t(unsigned, bucket_size_max,
 					ca->mi.bucket_size);
 	}
-	rcu_read_unlock();
 
 	bch2_set_ra_pages(c, ra_pages);
 
@@ -2584,10 +2576,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
 {
 	u64 ret = U64_MAX;
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_rw_member_rcu(c, ca)
 		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
-	rcu_read_unlock();
 	return ret;
 }
 
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index b97ae710219f..0cc5adc55b6f 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,11 +13,9 @@
 
 static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
-	bool ret = ca && bucket_valid(ca, pos.offset);
-	rcu_read_unlock();
-	return ret;
+	return ca && bucket_valid(ca, pos.offset);
 }
 
 static inline u64 bucket_to_u64(struct bpos bucket)
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0e7eeb89299c..e157bc86b2f3 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = {
 
 void bch2_reset_alloc_cursors(struct bch_fs *c)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_member_device_rcu(c, ca, NULL)
 		memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
-	rcu_read_unlock();
 }
 
 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
@@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
 	       ARRAY_SIZE(c->open_buckets_partial));
 
 	spin_lock(&c->freelist_lock);
-	rcu_read_lock();
-	bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
-	rcu_read_unlock();
+	scoped_guard(rcu)
+		bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
 
 	ob->on_partial_list = true;
 	c->open_buckets_partial[c->open_buckets_partial_nr++] =
@@ -873,9 +871,8 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
 					  i);
 			ob->on_partial_list = false;
 
-			rcu_read_lock();
-			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-			rcu_read_unlock();
+			scoped_guard(rcu)
+				bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
 
 			ret = add_new_bucket(c, req, ob);
 			if (ret)
@@ -1057,9 +1054,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
 
 			ob->on_partial_list = false;
 
-			rcu_read_lock();
-			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-			rcu_read_unlock();
+			scoped_guard(rcu)
+				bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
 
 			spin_unlock(&c->freelist_lock);
 			bch2_open_bucket_put(c, ob);
@@ -1087,14 +1083,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
 {
 	struct write_point *wp;
 
-	rcu_read_lock();
+	guard(rcu)();
 	hlist_for_each_entry_rcu(wp, head, node)
 		if (wp->write_point == write_point)
-			goto out;
-	wp = NULL;
-out:
-	rcu_read_unlock();
-	return wp;
+			return wp;
+	return NULL;
 }
 
 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
@@ -1638,19 +1631,16 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
 
 	bch2_printbuf_make_room(&buf, 4096);
 
-	rcu_read_lock();
 	buf.atomic++;
-
-	for_each_online_member_rcu(c, ca) {
-		prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
-		printbuf_indent_add(&buf, 2);
-		bch2_dev_alloc_debug_to_text(&buf, ca);
-		printbuf_indent_sub(&buf, 2);
-		prt_newline(&buf);
-	}
-
+	scoped_guard(rcu)
+		for_each_online_member_rcu(c, ca) {
+			prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+			printbuf_indent_add(&buf, 2);
+			bch2_dev_alloc_debug_to_text(&buf, ca);
+			printbuf_indent_sub(&buf, 2);
+			prt_newline(&buf);
+		}
 	--buf.atomic;
-	rcu_read_unlock();
 
 	prt_printf(&buf, "Copygc debug:\n");
 	printbuf_indent_add(&buf, 2);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index cde7dd115267..ebc8ee2cd33e 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -48,18 +48,20 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 {
 	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
 
-	rcu_read_lock();
-	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
-	if (ca) {
-		u32 bucket_offset;
-		struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
-		rcu_read_unlock();
-		prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
-	} else {
-		rcu_read_unlock();
-		prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
+	struct bch_dev *ca;
+	u32 bucket_offset;
+	struct bpos bucket;
+	scoped_guard(rcu) {
+		ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+		if (ca)
+			bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
 	}
 
+	if (ca)
+		prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
+	else
+		prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
+
 	bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
 	prt_str(out, " data_type=");
 	bch2_prt_data_type(out, bp.v->data_type);
@@ -591,6 +593,7 @@ static int check_bp_exists(struct btree_trans *trans,
 		bkey_for_each_ptr(other_extent_ptrs, ptr)
 			if (ptr->dev == bp->k.p.inode &&
 			    dev_ptr_stale_rcu(ca, ptr)) {
+				rcu_read_unlock();
 				ret = drop_dev_and_update(trans, other_bp.v->btree_id,
 							  other_extent, bp->k.p.inode);
 				if (ret)
@@ -679,26 +682,23 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 		if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
 			continue;
 
-		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
-		if (!ca) {
-			rcu_read_unlock();
-			continue;
-		}
+		bool empty;
+		{
+			/* scoped_guard() is a loop, so it breaks continue */
+			guard(rcu)();
+			struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+			if (!ca)
+				continue;
 
-		if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) {
-			rcu_read_unlock();
-			continue;
-		}
+			if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
+				continue;
 
-		u64 b = PTR_BUCKET_NR(ca, &p.ptr);
-		if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) {
-			rcu_read_unlock();
-			continue;
-		}
+			u64 b = PTR_BUCKET_NR(ca, &p.ptr);
+			if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
+				continue;
 
-		bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
-		rcu_read_unlock();
+			empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
+		}
 
 		struct bkey_i_backpointer bp;
 		bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
@@ -981,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
 	case KEY_TYPE_btree_ptr_v2: {
 		bool ret = false;
 
-		rcu_read_lock();
+		guard(rcu)();
 		struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
 		while (pos.inode <= k.k->p.inode) {
 			if (pos.inode >= c->sb.nr_devices)
@@ -1009,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
 next:
 			pos = SPOS(pos.inode + 1, 0, 0);
 		}
-		rcu_read_unlock();
 
 		return ret;
 	}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 6840561084ce..fac05948da1c 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca,
 
 static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
 	if (ca)
 		*bucket = bp_pos_to_bucket(ca, bp_pos);
-	rcu_read_unlock();
 	return ca != NULL;
 }
 
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 91b6395421df..f95486729ba5 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1093,42 +1093,41 @@ static int gc_btree_gens_key(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	struct bkey_i *u;
-	int ret;
 
 	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
 		return -EROFS;
 
-	rcu_read_lock();
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca)
-			continue;
+	bool too_stale = false;
+	scoped_guard(rcu) {
+		bkey_for_each_ptr(ptrs, ptr) {
+			struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+			if (!ca)
+				continue;
 
-		if (dev_ptr_stale(ca, ptr) > 16) {
-			rcu_read_unlock();
-			goto update;
+			too_stale |= dev_ptr_stale(ca, ptr) > 16;
 		}
+
+		if (!too_stale)
+			bkey_for_each_ptr(ptrs, ptr) {
+				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+				if (!ca)
+					continue;
+
+				u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+				if (gen_after(*gen, ptr->gen))
+					*gen = ptr->gen;
+			}
 	}
 
-	bkey_for_each_ptr(ptrs, ptr) {
-		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca)
-			continue;
+	if (too_stale) {
+		struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
+		int ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			return ret;
 
-		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
-		if (gen_after(*gen, ptr->gen))
-			*gen = ptr->gen;
+		bch2_extent_normalize(c, bkey_i_to_s(u));
 	}
-	rcu_read_unlock();
-	return 0;
-update:
-	u = bch2_bkey_make_mut(trans, iter, &k, 0);
-	ret = PTR_ERR_OR_ZERO(u);
-	if (ret)
-		return ret;
 
-	bch2_extent_normalize(c, bkey_i_to_s(u));
 	return 0;
 }
 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index c19a4b44162a..2e191561d578 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1325,14 +1325,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 	btree_node_reset_sib_u64s(b);
 
-	rcu_read_lock();
-	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-		struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+	scoped_guard(rcu)
+		bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+			struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
 
-		if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
-			set_btree_node_need_rewrite(b);
-	}
-	rcu_read_unlock();
+			if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
+				set_btree_node_need_rewrite(b);
+		}
 
 	if (!ptr_written)
 		set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 5affa5fc22f4..c7606e0b113d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -3568,13 +3568,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
 				      struct btree_bkey_cached_common *b)
 {
 	struct six_lock_count c = six_lock_counts(&b->lock);
-	struct task_struct *owner;
 	pid_t pid;
 
-	rcu_read_lock();
-	owner = READ_ONCE(b->lock.owner);
-	pid = owner ? owner->pid : 0;
-	rcu_read_unlock();
+	scoped_guard(rcu) {
+		struct task_struct *owner = READ_ONCE(b->lock.owner);
+		pid = owner ? owner->pid : 0;
+	}
 
 	prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
 	bch2_btree_id_to_text(out, b->btree_id);
@@ -3603,7 +3602,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 	prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
 
 	/* trans->paths is rcu protected vs. freeing */
-	rcu_read_lock();
+	guard(rcu)();
 	out->atomic++;
 
 	struct btree_path *paths = rcu_dereference(trans->paths);
@@ -3646,7 +3645,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 	}
 out:
 	--out->atomic;
-	rcu_read_unlock();
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ade3b5addd75..48863e6925e0 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -457,11 +457,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
 
 static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
 {
-	struct bkey_s_c ret = bkey_s_c_null;
-
 	journal_iter_verify(iter);
 
-	rcu_read_lock();
+	guard(rcu)();
 	while (iter->idx < iter->keys->size) {
 		struct journal_key *k = iter->keys->data + iter->idx;
 
@@ -470,19 +468,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
 			break;
 		BUG_ON(cmp);
 
-		if (!k->overwritten) {
-			ret = bkey_i_to_s_c(k->k);
-			break;
-		}
+		if (!k->overwritten)
+			return bkey_i_to_s_c(k->k);
 
 		if (k->overwritten_range)
 			iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
 		else
 			bch2_journal_iter_advance(iter);
 	}
-	rcu_read_unlock();
 
-	return ret;
+	return bkey_s_c_null;
 }
 
 static void bch2_journal_iter_exit(struct journal_iter *iter)
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 9da950e7eb7d..e954b19756c0 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -187,27 +187,23 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
 static struct bkey_cached *
 bkey_cached_reuse(struct btree_key_cache *c)
 {
-	struct bucket_table *tbl;
+
+	guard(rcu)();
+	struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 	struct rhash_head *pos;
 	struct bkey_cached *ck;
-	unsigned i;
 
-	rcu_read_lock();
-	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
-	for (i = 0; i < tbl->size; i++)
+	for (unsigned i = 0; i < tbl->size; i++)
 		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 			    bkey_cached_lock_for_evict(ck)) {
 				if (bkey_cached_evict(c, ck))
-					goto out;
+					return ck;
 				six_unlock_write(&ck->c.lock);
 				six_unlock_intent(&ck->c.lock);
 			}
 		}
-	ck = NULL;
-out:
-	rcu_read_unlock();
-	return ck;
+	return NULL;
 }
 
 static int btree_key_cache_create(struct btree_trans *trans,
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 09ae5a8c6874..47035aae232e 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -311,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
 	lock_graph_down(&g, trans);
 
 	/* trans->paths is rcu protected vs. freeing */
-	rcu_read_lock();
+	guard(rcu)();
 	if (cycle)
 		cycle->atomic++;
 next:
@@ -409,7 +409,6 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
 out:
 	if (cycle)
 		--cycle->atomic;
-	rcu_read_unlock();
 	return ret;
 }
 
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 410e0116917f..56bd8f66295c 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -299,9 +299,8 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
 		if (ret)
 			goto err;
 
-		rcu_read_lock();
-		bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
-		rcu_read_unlock();
+		scoped_guard(rcu)
+			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
 
 		if (level) {
 			/*
@@ -310,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
 			 * sort it out:
 			 */
 			struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-			rcu_read_lock();
-			bkey_for_each_ptr(ptrs, ptr) {
-				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
-				ptr->gen = g->gen;
-			}
-			rcu_read_unlock();
+			scoped_guard(rcu)
+				bkey_for_each_ptr(ptrs, ptr) {
+					struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+					ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
+				}
 		} else {
 			struct bkey_ptrs ptrs;
 			union bch_extent_entry *entry;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index af1532de4a37..49a3807a5eab 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
 
 static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
 {
-	rcu_read_lock();
-	int ret = bucket_gen_get_rcu(ca, b);
-	rcu_read_unlock();
-	return ret;
+	guard(rcu)();
+	return bucket_gen_get_rcu(ca, b);
 }
 
 static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_
  */
 static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
 {
-	rcu_read_lock();
-	int ret = dev_ptr_stale_rcu(ca, ptr);
-	rcu_read_unlock();
-	return ret;
+	guard(rcu)();
+	return dev_ptr_stale_rcu(ca, ptr);
 }
 
 /* Device usage: */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4066946b26bc..2db30eb134f3 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -613,13 +613,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
 	if (!dev)
 		return -EINVAL;
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_online_member_rcu(c, ca)
-		if (ca->dev == dev) {
-			rcu_read_unlock();
+		if (ca->dev == dev)
 			return ca->dev_idx;
-		}
-	rcu_read_unlock();
 
 	return -BCH_ERR_ENOENT_dev_idx_not_found;
 }
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index fafe7a57ea41..5c687ed1bcb2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -376,21 +376,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
 
 		/* Now, drop excess replicas: */
-		rcu_read_lock();
+		scoped_guard(rcu) {
 restart_drop_extra_replicas:
-		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
-			unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+			bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+				unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
 
-			if (!p.ptr.cached &&
-			    durability - ptr_durability >= m->op.opts.data_replicas) {
-				durability -= ptr_durability;
+				if (!p.ptr.cached &&
+				    durability - ptr_durability >= m->op.opts.data_replicas) {
+					durability -= ptr_durability;
 
-				bch2_extent_ptr_set_cached(c, &m->op.opts,
-							   bkey_i_to_s(insert), &entry->ptr);
-				goto restart_drop_extra_replicas;
+					bch2_extent_ptr_set_cached(c, &m->op.opts,
+								   bkey_i_to_s(insert), &entry->ptr);
+					goto restart_drop_extra_replicas;
+				}
 			}
 		}
-		rcu_read_unlock();
 
 		/* Finally, add the pointers we just wrote: */
 		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -782,7 +782,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	darray_for_each(m->op.devs_have, i)
 		__clear_bit(*i, devs.d);
 
-	rcu_read_lock();
+	guard(rcu)();
+
 	unsigned nr_replicas = 0, i;
 	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
@@ -799,7 +800,6 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 		if (nr_replicas >= m->op.nr_replicas)
 			break;
 	}
-	rcu_read_unlock();
 
 	if (!nr_replicas)
 		return -BCH_ERR_data_update_done_no_rw_devs;
@@ -876,7 +876,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	unsigned ptr_bit = 1;
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		if (!p.ptr.cached) {
-			rcu_read_lock();
+			guard(rcu)();
 			if (ptr_bit & m->data_opts.rewrite_ptrs) {
 				if (crc_is_compressed(p.crc))
 					reserve_sectors += k.k->size;
@@ -887,7 +887,6 @@ int bch2_data_update_init(struct btree_trans *trans,
 				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
 				durability_have += bch2_extent_ptr_durability(c, &p);
 			}
-			rcu_read_unlock();
 		}
 
 		/*
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 04db3e0ed82b..901f643ead83 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -510,27 +510,27 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
 	i->ret	= 0;
 
 	do {
-		struct bucket_table *tbl;
-		struct rhash_head *pos;
-		struct btree *b;
-
 		ret = bch2_debugfs_flush_buf(i);
 		if (ret)
 			return ret;
 
-		rcu_read_lock();
 		i->buf.atomic++;
-		tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
-					  &c->btree_cache.table);
-		if (i->iter < tbl->size) {
-			rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
-				bch2_cached_btree_node_to_text(&i->buf, c, b);
-			i->iter++;
-		} else {
-			done = true;
+		scoped_guard(rcu) {
+			struct bucket_table *tbl =
+				rht_dereference_rcu(c->btree_cache.table.tbl,
+						    &c->btree_cache.table);
+			if (i->iter < tbl->size) {
+				struct rhash_head *pos;
+				struct btree *b;
+
+				rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+					bch2_cached_btree_node_to_text(&i->buf, c, b);
+				i->iter++;
+			} else {
+				done = true;
+			}
 		}
 		--i->buf.atomic;
-		rcu_read_unlock();
 	} while (!done);
 
 	if (i->buf.allocation_failure)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b3840ff7c407..6e4a68263dfb 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -897,8 +897,8 @@ int bch2_accounting_read(struct bch_fs *c)
 		case BCH_DISK_ACCOUNTING_replicas:
 			fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
 			break;
-		case BCH_DISK_ACCOUNTING_dev_data_type:
-			rcu_read_lock();
+		case BCH_DISK_ACCOUNTING_dev_data_type: {
+			guard(rcu)();
 			struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
 			if (ca) {
 				struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
@@ -910,9 +910,9 @@ int bch2_accounting_read(struct bch_fs *c)
 				    k.dev_data_type.data_type == BCH_DATA_journal)
 					usage->hidden += v[0] * ca->mi.bucket_size;
 			}
-			rcu_read_unlock();
 			break;
 		}
+		}
 	}
 	preempt_enable();
 fsck_err:
@@ -1006,18 +1006,17 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
 			case BCH_DISK_ACCOUNTING_replicas:
 				fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
 				break;
-			case BCH_DISK_ACCOUNTING_dev_data_type: {
-				rcu_read_lock();
-				struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
-				if (!ca) {
-					rcu_read_unlock();
-					continue;
-				}
+			case BCH_DISK_ACCOUNTING_dev_data_type:
+				{
+					guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
+					struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+					if (!ca)
+						continue;
 
-				v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
-				v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
-				v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
-				rcu_read_unlock();
+					v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+					v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+					v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
+				}
 
 				if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
 					struct printbuf buf = PRINTBUF;
@@ -1032,7 +1031,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
 					mismatch = true;
 				}
 			}
-			}
 
 			0;
 		})));
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f6098e33ab30..d61abebf3e0b 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
 		case BCH_DISK_ACCOUNTING_replicas:
 			fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
 			break;
-		case BCH_DISK_ACCOUNTING_dev_data_type:
-			rcu_read_lock();
+		case BCH_DISK_ACCOUNTING_dev_data_type: {
+			guard(rcu)();
 			struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
 			if (ca) {
 				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
 				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
 				this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
 			}
-			rcu_read_unlock();
 			break;
 		}
+		}
 	}
 
 	unsigned idx;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index c20ecf5e5381..9a17ef78f647 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -170,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
 {
 	struct target t = target_decode(target);
-	struct bch_devs_mask *devs;
 
-	rcu_read_lock();
+	guard(rcu)();
 
 	switch (t.type) {
 	case TARGET_NULL:
-		devs = NULL;
-		break;
+		return NULL;
 	case TARGET_DEV: {
 		struct bch_dev *ca = t.dev < c->sb.nr_devices
 			? rcu_dereference(c->devs[t.dev])
 			: NULL;
-		devs = ca ? &ca->self : NULL;
-		break;
+		return ca ? &ca->self : NULL;
 	}
 	case TARGET_GROUP: {
 		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
 
-		devs = g && t.group < g->nr && !g->entries[t.group].deleted
+		return g && t.group < g->nr && !g->entries[t.group].deleted
 			? &g->entries[t.group].devs
 			: NULL;
-		break;
 	}
 	default:
 		BUG();
 	}
-
-	rcu_read_unlock();
-
-	return devs;
 }
 
 bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
@@ -384,7 +376,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
 	bch2_printbuf_make_room(out, 4096);
 
 	out->atomic++;
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
 
 	for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
@@ -405,16 +397,14 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
 		prt_newline(out);
 	}
 
-	rcu_read_unlock();
 	out->atomic--;
 }
 
 void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 {
 	out->atomic++;
-	rcu_read_lock();
+	guard(rcu)();
 	__bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
-	rcu_read_unlock();
 	--out->atomic;
 }
 
@@ -535,13 +525,11 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 	switch (t.type) {
 	case TARGET_NULL:
 		prt_printf(out, "none");
-		break;
+		return;
 	case TARGET_DEV: {
-		struct bch_dev *ca;
-
 		out->atomic++;
-		rcu_read_lock();
-		ca = t.dev < c->sb.nr_devices
+		guard(rcu)();
+		struct bch_dev *ca = t.dev < c->sb.nr_devices
 			? rcu_dereference(c->devs[t.dev])
 			: NULL;
 
@@ -552,13 +540,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 		else
 			prt_printf(out, "invalid device %u", t.dev);
 
-		rcu_read_unlock();
 		out->atomic--;
-		break;
+		return;
 	}
 	case TARGET_GROUP:
 		bch2_disk_path_to_text(out, c, t.group);
-		break;
+		return;
 	default:
 		BUG();
 	}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index c581426e3894..0a5f3a2e45a2 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1578,26 +1578,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str
 static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
 {
 	struct bch_devs_mask devs = h->devs;
+	unsigned nr_devs, nr_devs_with_durability;
 
-	rcu_read_lock();
-	h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
-				 ? group_to_target(h->disk_label - 1)
-				 : 0);
-	unsigned nr_devs = dev_mask_nr(&h->devs);
+	scoped_guard(rcu) {
+		h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+					 ? group_to_target(h->disk_label - 1)
+					 : 0);
+		nr_devs = dev_mask_nr(&h->devs);
 
-	for_each_member_device_rcu(c, ca, &h->devs)
-		if (!ca->mi.durability)
-			__clear_bit(ca->dev_idx, h->devs.d);
-	unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
+		for_each_member_device_rcu(c, ca, &h->devs)
+			if (!ca->mi.durability)
+				__clear_bit(ca->dev_idx, h->devs.d);
+		nr_devs_with_durability = dev_mask_nr(&h->devs);
 
-	h->blocksize = pick_blocksize(c, &h->devs);
+		h->blocksize = pick_blocksize(c, &h->devs);
 
-	h->nr_active_devs = 0;
-	for_each_member_device_rcu(c, ca, &h->devs)
-		if (ca->mi.bucket_size == h->blocksize)
-			h->nr_active_devs++;
-
-	rcu_read_unlock();
+		h->nr_active_devs = 0;
+		for_each_member_device_rcu(c, ca, &h->devs)
+			if (ca->mi.bucket_size == h->blocksize)
+				h->nr_active_devs++;
+	}
 
 	/*
 	 * If we only have redundancy + 1 devices, we're better off with just
@@ -2141,15 +2141,14 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
 
 	unsigned nr_good = 0;
 
-	rcu_read_lock();
-	bkey_for_each_ptr(ptrs, ptr) {
-		if (ptr->dev == dev_idx)
-			ptr->dev = BCH_SB_MEMBER_INVALID;
+	scoped_guard(rcu)
+		bkey_for_each_ptr(ptrs, ptr) {
+			if (ptr->dev == dev_idx)
+				ptr->dev = BCH_SB_MEMBER_INVALID;
 
-		struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
-		nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
-	}
-	rcu_read_unlock();
+			struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
+			nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
+		}
 
 	if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
 		return -BCH_ERR_remove_would_lose_data;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 1ac9897f189d..677cf453b332 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -65,15 +65,15 @@ void bch2_io_failures_to_text(struct printbuf *out,
 			continue;
 
 		bch2_printbuf_make_room(out, 1024);
-		rcu_read_lock();
 		out->atomic++;
-		struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
-		if (ca)
-			prt_str(out, ca->name);
-		else
-			prt_printf(out, "(invalid device %u)", f->dev);
+		scoped_guard(rcu) {
+			struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
+			if (ca)
+				prt_str(out, ca->name);
+			else
+				prt_printf(out, "(invalid device %u)", f->dev);
+		}
 		--out->atomic;
-		rcu_read_unlock();
 
 		prt_char(out, ' ');
 
@@ -407,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
 	rp.crc = bch2_extent_crc_unpack(r.k, NULL);
 
+	guard(rcu)();
+
 	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
 	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
 		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
@@ -418,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 			return false;
 
 		/* Extents may not straddle buckets: */
-		rcu_read_lock();
 		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
 		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
-		rcu_read_unlock();
 
 		if (!same_bucket)
 			return false;
@@ -838,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
 	struct extent_ptr_decoded p;
 	unsigned durability = 0;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		durability += bch2_extent_ptr_durability(c, &p);
-	rcu_read_unlock();
-
 	return durability;
 }
 
@@ -853,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
 	struct extent_ptr_decoded p;
 	unsigned durability = 0;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
 			durability += bch2_extent_ptr_durability(c, &p);
-	rcu_read_unlock();
-
 	return durability;
 }
 
@@ -1015,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	struct bch_dev *ca;
-	bool ret = false;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr(ptrs, ptr)
 		if (bch2_dev_in_target(c, ptr->dev, target) &&
 		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
 		    (!ptr->cached ||
-		     !dev_ptr_stale_rcu(ca, ptr))) {
-			ret = true;
-			break;
-		}
-	rcu_read_unlock();
+		     !dev_ptr_stale_rcu(ca, ptr)))
+			return true;
 
-	return ret;
+	return false;
 }
 
 bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -1142,7 +1134,7 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
 	bool have_cached_ptr;
 	unsigned drop_dev = ptr->dev;
 
-	rcu_read_lock();
+	guard(rcu)();
 restart_drop_ptrs:
 	ptrs = bch2_bkey_ptrs(k);
 	have_cached_ptr = false;
@@ -1175,10 +1167,8 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
 		goto drop;
 
 	ptr->cached = true;
-	rcu_read_unlock();
 	return;
 drop:
-	rcu_read_unlock();
 	bch2_bkey_drop_ptr_noerror(k, ptr);
 }
 
@@ -1194,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 {
 	struct bch_dev *ca;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bch2_bkey_drop_ptrs(k, ptr,
 		ptr->cached &&
 		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
 		 dev_ptr_stale_rcu(ca, ptr) > 0));
-	rcu_read_unlock();
 
 	return bkey_deleted(k.k);
 }
@@ -1217,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
 	struct bkey_ptrs ptrs;
 	bool have_cached_ptr;
 
-	rcu_read_lock();
+	guard(rcu)();
 restart_drop_ptrs:
 	ptrs = bch2_bkey_ptrs(k);
 	have_cached_ptr = false;
@@ -1230,7 +1219,6 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
 			}
 			have_cached_ptr = true;
 		}
-	rcu_read_unlock();
 
 	return bkey_deleted(k.k);
 }
@@ -1238,7 +1226,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
 {
 	out->atomic++;
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
 	if (!ca) {
 		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
@@ -1262,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
 		else if (stale)
 			prt_printf(out, " invalid");
 	}
-	rcu_read_unlock();
 	--out->atomic;
 }
 
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index b1e9ee28fc0f..a233f45875e9 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -71,12 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
 	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
 
 	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
-		rcu_read_lock();
-		ca = rcu_dereference(c->devs[dev]);
-		if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
-					BCH_DEV_WRITE_REF_nocow_flush))
-			ca = NULL;
-		rcu_read_unlock();
+		scoped_guard(rcu) {
+			ca = rcu_dereference(c->devs[dev]);
+			if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
+							 BCH_DEV_WRITE_REF_nocow_flush))
+				ca = NULL;
+		}
 
 		if (!ca)
 			continue;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f52c7db16dec..ebf967e82d8a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2327,14 +2327,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
 	struct bch_fs *c = root->d_sb->s_fs_info;
 	bool first = true;
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_online_member_rcu(c, ca) {
 		if (!first)
 			seq_putc(seq, ':');
 		first = false;
 		seq_puts(seq, ca->disk_sb.sb_name);
 	}
-	rcu_read_unlock();
 
 	return 0;
 }
@@ -2531,16 +2530,16 @@ static int bch2_fs_get_tree(struct fs_context *fc)
 
 	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
 
-	rcu_read_lock();
-	for_each_online_member_rcu(c, ca) {
-		struct block_device *bdev = ca->disk_sb.bdev;
+	scoped_guard(rcu) {
+		for_each_online_member_rcu(c, ca) {
+			struct block_device *bdev = ca->disk_sb.bdev;
 
-		/* XXX: create an anonymous device for multi device filesystems */
-		sb->s_bdev	= bdev;
-		sb->s_dev	= bdev->bd_dev;
-		break;
+			/* XXX: create an anonymous device for multi device filesystems */
+			sb->s_bdev	= bdev;
+			sb->s_dev	= bdev->bd_dev;
+			break;
+		}
 	}
-	rcu_read_unlock();
 
 	c->dev = sb->s_dev;
 
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index cc708d46557e..fb83c6942485 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -56,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 	if (!target)
 		return false;
 
-	rcu_read_lock();
+	guard(rcu)();
 	devs = bch2_target_to_mask(c, target) ?:
 		&c->rw_devs[BCH_DATA_user];
 
@@ -73,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 		total += max(congested, 0LL);
 		nr++;
 	}
-	rcu_read_unlock();
 
 	return get_random_u32_below(nr * CONGESTED_MAX) < total;
 }
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 52a60982a66b..b9d624e8f099 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1208,16 +1208,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
 
 	e = bkey_s_c_to_extent(k);
 
-	rcu_read_lock();
+	guard(rcu)();
 	extent_for_each_ptr_decode(e, p, entry) {
-		if (crc_is_encoded(p.crc) || p.has_ec) {
-			rcu_read_unlock();
+		if (crc_is_encoded(p.crc) || p.has_ec)
 			return false;
-		}
 
 		replicas += bch2_extent_ptr_durability(c, &p);
 	}
-	rcu_read_unlock();
 
 	return replicas >= op->opts.data_replicas;
 }
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index fd7f9ff33da0..0348ab3276f4 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -708,10 +708,9 @@ static unsigned max_dev_latency(struct bch_fs *c)
 {
 	u64 nsecs = 0;
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_rw_member_rcu(c, ca)
 		nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
-	rcu_read_unlock();
 
 	return nsecs_to_jiffies(nsecs);
 }
@@ -1732,7 +1731,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	printbuf_tabstop_push(out, 28);
 	out->atomic++;
 
-	rcu_read_lock();
+	guard(rcu)();
 	s = READ_ONCE(j->reservations);
 
 	prt_printf(out, "flags:\t");
@@ -1823,8 +1822,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 
 	prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
 
-	rcu_read_unlock();
-
 	--out->atomic;
 }
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 52297057531b..8ce41753eadb 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1521,7 +1521,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
-	rcu_read_lock();
+	guard(rcu)();
 	darray_for_each(*devs, i) {
 		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
 		if (!ca)
@@ -1543,7 +1543,6 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
 			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
 		}
 	}
-	rcu_read_unlock();
 }
 
 static void __journal_write_alloc(struct journal *j,
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index d0604218bf65..67be0c33f70c 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -148,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
 
 	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
 
-	rcu_read_lock();
 	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
 		if (!ca->journal.nr ||
 		    !ca->mi.durability)
@@ -166,7 +165,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
 
 		array_insert_item(dev_space, nr_devs, pos, space);
 	}
-	rcu_read_unlock();
 
 	if (nr_devs < nr_devs_want)
 		return (struct journal_space) { 0, 0 };
@@ -191,8 +189,8 @@ void bch2_journal_space_available(struct journal *j)
 	int ret = 0;
 
 	lockdep_assert_held(&j->lock);
+	guard(rcu)();
 
-	rcu_read_lock();
 	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
 		struct journal_device *ja = &ca->journal;
 
@@ -212,7 +210,6 @@ void bch2_journal_space_available(struct journal *j)
 		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
 		nr_online++;
 	}
-	rcu_read_unlock();
 
 	j->can_discard = can_discard;
 
@@ -223,10 +220,8 @@ void bch2_journal_space_available(struct journal *j)
 			prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
 				   "rw journal devs:", nr_online, metadata_replicas_required(c));
 
-			rcu_read_lock();
 			for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
 				prt_printf(&buf, " %s", ca->name);
-			rcu_read_unlock();
 
 			bch_err(c, "%s", buf.buf);
 			printbuf_exit(&buf);
@@ -626,9 +621,9 @@ static u64 journal_seq_to_flush(struct journal *j)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	u64 seq_to_flush = 0;
 
-	spin_lock(&j->lock);
+	guard(spinlock)(&j->lock);
+	guard(rcu)();
 
-	rcu_read_lock();
 	for_each_rw_member_rcu(c, ca) {
 		struct journal_device *ja = &ca->journal;
 		unsigned nr_buckets, bucket_to_flush;
@@ -643,15 +638,11 @@ static u64 journal_seq_to_flush(struct journal *j)
 		seq_to_flush = max(seq_to_flush,
 				   ja->bucket_seq[bucket_to_flush]);
 	}
-	rcu_read_unlock();
 
 	/* Also flush if the pin fifo is more than half full */
-	seq_to_flush = max_t(s64, seq_to_flush,
-			     (s64) journal_cur_seq(j) -
-			     (j->pin.size >> 1));
-	spin_unlock(&j->lock);
-
-	return seq_to_flush;
+	return max_t(s64, seq_to_flush,
+		     (s64) journal_cur_seq(j) -
+		     (j->pin.size >> 1));
 }
 
 /**
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 2f63fc6d456f..57b5b3263b08 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c,
 	case BCH_LRU_fragmentation: {
 		a = bch2_alloc_to_v4(k, &a_convert);
 
-		rcu_read_lock();
+		guard(rcu)();
 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
-		u64 idx = ca
+		return ca
 			? alloc_lru_idx_fragmentation(*a, ca)
 			: 0;
-		rcu_read_unlock();
-		return idx;
 	}
 	case BCH_LRU_stripes:
 		return k.k->type == KEY_TYPE_stripe
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index fc1a7a04cb15..5dafc0018b0e 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -1176,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
 		? c->opts.metadata_replicas
 		: io_opts->data_replicas;
 
-	rcu_read_lock();
+	guard(rcu)();
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	unsigned i = 0;
 	bkey_for_each_ptr(ptrs, ptr) {
@@ -1186,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
 			data_opts->kill_ptrs |= BIT(i);
 		i++;
 	}
-	rcu_read_unlock();
 
 	if (!data_opts->kill_ptrs &&
 	    (!nr_good || nr_good >= replicas))
@@ -1294,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
 	struct extent_ptr_decoded p;
 	unsigned i = 0;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
 		unsigned d = bch2_extent_ptr_durability(c, &p);
 
@@ -1305,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
 
 		i++;
 	}
-	rcu_read_unlock();
 
 	return data_opts->kill_ptrs != 0;
 }
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index e7a2a13554d7..6d7b1d5f7697 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -293,11 +293,9 @@ u64 bch2_copygc_wait_amount(struct bch_fs *c)
 {
 	u64 wait = U64_MAX;
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_rw_member_rcu(c, ca)
 		wait = min(wait, bch2_copygc_dev_wait_amount(ca));
-	rcu_read_unlock();
-
 	return wait;
 }
 
@@ -321,21 +319,21 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 
 	bch2_printbuf_make_room(out, 4096);
 
-	rcu_read_lock();
+	struct task_struct *t;
 	out->atomic++;
+	scoped_guard(rcu) {
+		prt_printf(out, "Currently calculated wait:\n");
+		for_each_rw_member_rcu(c, ca) {
+			prt_printf(out, "  %s:\t", ca->name);
+			prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
+			prt_newline(out);
+		}
 
-	prt_printf(out, "Currently calculated wait:\n");
-	for_each_rw_member_rcu(c, ca) {
-		prt_printf(out, "  %s:\t", ca->name);
-		prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
-		prt_newline(out);
+		t = rcu_dereference(c->copygc_thread);
+		if (t)
+			get_task_struct(t);
 	}
-
-	struct task_struct *t = rcu_dereference(c->copygc_thread);
-	if (t)
-		get_task_struct(t);
 	--out->atomic;
-	rcu_read_unlock();
 
 	if (t) {
 		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index b9683d22bab0..f615910d6f98 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -7,11 +7,10 @@ void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void bch2_copygc_wakeup(struct bch_fs *c)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	struct task_struct *p = rcu_dereference(c->copygc_thread);
 	if (p)
 		wake_up_process(p);
-	rcu_read_unlock();
 }
 
 void bch2_copygc_stop(struct bch_fs *);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index dbaabaad1986..c563797ec5d5 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -80,13 +80,12 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
 	unsigned ptr_bit = 1;
 	unsigned rewrite_ptrs = 0;
 
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr(ptrs, ptr) {
 		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
 			rewrite_ptrs |= ptr_bit;
 		ptr_bit <<= 1;
 	}
-	rcu_read_unlock();
 
 	return rewrite_ptrs;
 }
@@ -135,12 +134,11 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
 	}
 incompressible:
 	if (opts->background_target) {
-		rcu_read_lock();
+		guard(rcu)();
 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 			if (!p.ptr.cached &&
 			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
 				sectors += p.crc.compressed_size;
-		rcu_read_unlock();
 	}
 
 	return sectors;
@@ -679,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
 	}
 	prt_newline(out);
 
-	rcu_read_lock();
-	struct task_struct *t = rcu_dereference(c->rebalance.thread);
-	if (t)
-		get_task_struct(t);
-	rcu_read_unlock();
+	struct task_struct *t;
+	scoped_guard(rcu) {
+		t = rcu_dereference(c->rebalance.thread);
+		if (t)
+			get_task_struct(t);
+	}
 
 	if (t) {
 		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 477ef0997949..4698f8866cd2 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -819,20 +819,19 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
 		if (e->data_type == BCH_DATA_cached)
 			continue;
 
-		rcu_read_lock();
-		for (unsigned i = 0; i < e->nr_devs; i++) {
-			if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
-				nr_failed++;
-				continue;
+		scoped_guard(rcu)
+			for (unsigned i = 0; i < e->nr_devs; i++) {
+				if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+					nr_failed++;
+					continue;
+				}
+
+				nr_online += test_bit(e->devs[i], devs.d);
+
+				struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+				nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
 			}
 
-			nr_online += test_bit(e->devs[i], devs.d);
-
-			struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
-			nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-		}
-		rcu_read_unlock();
-
 		if (nr_online + nr_failed == e->nr_devs)
 			continue;
 
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 3398906660a5..ab5673b34e1d 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -378,14 +378,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c)
 {
 	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_member_device_rcu(c, ca, NULL) {
 		struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
 
 		for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
 			m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
 	}
-	rcu_read_unlock();
 }
 
 void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
@@ -443,20 +442,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
 
 bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
 {
-	bool ret = true;
-	rcu_read_lock();
+	guard(rcu)();
 	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
 		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-		if (!ca)
-			continue;
-
-		if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
-			ret = false;
-			break;
-		}
+		if (ca &&
+		    !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
+			return false;
 	}
-	rcu_read_unlock();
-	return ret;
+	return true;
 }
 
 static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 6bd9b86aee5b..8d8a8a857648 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -28,12 +28,9 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
 
 static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu(c, dev);
-	bool ret = ca && bch2_dev_is_online(ca);
-	rcu_read_unlock();
-
-	return ret;
+	return ca && bch2_dev_is_online(ca);
 }
 
 static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
@@ -142,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca)
 
 static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	bch2_dev_put(ca);
 	if ((ca = __bch2_next_dev(c, ca, NULL)))
 		bch2_dev_get(ca);
-	rcu_read_unlock();
-
 	return ca;
 }
 
@@ -166,7 +161,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
 						       unsigned state_mask,
 						       int rw, unsigned ref_idx)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	if (ca)
 		enumerated_ref_put(&ca->io_ref[rw], ref_idx);
 
@@ -174,7 +169,6 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
 	       (!((1 << ca->mi.state) & state_mask) ||
 		!enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
 		;
-	rcu_read_unlock();
 
 	return ca;
 }
@@ -239,11 +233,10 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
 
 static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
 	if (ca)
 		bch2_dev_get(ca);
-	rcu_read_unlock();
 	return ca;
 }
 
@@ -299,19 +292,16 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
 {
 	might_sleep();
 
-	rcu_read_lock();
+	guard(rcu)();
 	struct bch_dev *ca = bch2_dev_rcu(c, dev);
-	if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
-		ca = NULL;
-	rcu_read_unlock();
+	if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
+		return NULL;
 
-	if (ca &&
-	    (ca->mi.state == BCH_MEMBER_STATE_rw ||
-	    (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+	if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+	    (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
 		return ca;
 
-	if (ca)
-		enumerated_ref_put(&ca->io_ref[rw], ref_idx);
+	enumerated_ref_put(&ca->io_ref[rw], ref_idx);
 	return NULL;
 }
 
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 7c403427fbdb..538c324f4765 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock)
 	 * acquiring the lock and setting the owner field. If we're an RT task
 	 * that will live-lock because we won't let the owner complete.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 	struct task_struct *owner = READ_ONCE(lock->owner);
-	bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
-	rcu_read_unlock();
-
-	return ret;
+	return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
 }
 
 static inline bool six_optimistic_spin(struct six_lock *lock,
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index cf9a65e858f6..612c526a94dd 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -105,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id,
 
 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
 {
-	rcu_read_lock();
-	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
-	rcu_read_unlock();
-
-	return ret;
+	guard(rcu)();
+	return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
 }
 
 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -140,13 +137,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 {
 	bool ret;
 
-	rcu_read_lock();
+	guard(rcu)();
 	struct snapshot_table *t = rcu_dereference(c->snapshots);
 
-	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
-		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
-		goto out;
-	}
+	if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
+		return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
 
 	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
 		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
@@ -157,9 +152,6 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 		: id == ancestor;
 
 	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
-out:
-	rcu_read_unlock();
-
 	return ret;
 }
 
@@ -412,10 +404,10 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
 u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
 				snapshot_id_list *skip)
 {
+	guard(rcu)();
 	u32 id, subvol = 0, s;
 retry:
 	id = snapshot_root;
-	rcu_read_lock();
 	while (id && bch2_snapshot_exists(c, id)) {
 		if (!(skip && snapshot_list_has_id(skip, id))) {
 			s = snapshot_t(c, id)->subvol;
@@ -427,7 +419,6 @@ u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
 		if (id == snapshot_root)
 			break;
 	}
-	rcu_read_unlock();
 
 	if (!subvol && skip) {
 		skip = NULL;
@@ -617,18 +608,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,
 
 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
 {
-	const struct snapshot_t *s;
-
 	if (!id)
 		return 0;
 
-	rcu_read_lock();
-	s = snapshot_t(c, id);
-	if (s->parent)
-		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
-	rcu_read_unlock();
-
-	return id;
+	guard(rcu)();
+	const struct snapshot_t *s = snapshot_t(c, id);
+	return s->parent
+		? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
+		: id;
 }
 
 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
@@ -1458,11 +1445,9 @@ static unsigned live_child(struct bch_fs *c, u32 id)
 {
 	struct snapshot_delete *d = &c->snapshot_delete;
 
-	rcu_read_lock();
-	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
-			       &d->delete_leaves, &d->delete_interior);
-	rcu_read_unlock();
-	return ret;
+	guard(rcu)();
+	return __live_child(rcu_dereference(c->snapshots), id,
+			    &d->delete_leaves, &d->delete_interior);
 }
 
 static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
@@ -1719,7 +1704,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
 						interior_delete_list *skip)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	while (interior_delete_has_id(skip, id))
 		id = __bch2_snapshot_parent(c, id);
 
@@ -1728,7 +1713,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
 			id = __bch2_snapshot_parent(c, id);
 		} while (interior_delete_has_id(skip, id));
 	}
-	rcu_read_unlock();
 
 	return id;
 }
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index ee79f81f175c..6766bf673ed9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
 
 static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	const struct snapshot_t *s = snapshot_t(c, id);
-	id = s ? s->tree : 0;
-	rcu_read_unlock();
-
-	return id;
+	return s ? s->tree : 0;
 }
 
 static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 
 static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	id = __bch2_snapshot_parent_early(c, id);
-	rcu_read_unlock();
-
-	return id;
+	guard(rcu)();
+	return __bch2_snapshot_parent_early(c, id);
 }
 
 static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -88,20 +82,15 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
 
 static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	id = __bch2_snapshot_parent(c, id);
-	rcu_read_unlock();
-
-	return id;
+	guard(rcu)();
+	return __bch2_snapshot_parent(c, id);
 }
 
 static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	while (n--)
 		id = __bch2_snapshot_parent(c, id);
-	rcu_read_unlock();
-
 	return id;
 }
 
@@ -110,13 +99,11 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
 
 static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
 {
-	u32 parent;
+	guard(rcu)();
 
-	rcu_read_lock();
+	u32 parent;
 	while ((parent = __bch2_snapshot_parent(c, id)))
 		id = parent;
-	rcu_read_unlock();
-
 	return id;
 }
 
@@ -128,11 +115,8 @@ static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c,
 
 static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
-	enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id);
-	rcu_read_unlock();
-
-	return ret;
+	guard(rcu)();
+	return __bch2_snapshot_id_state(c, id);
 }
 
 static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
@@ -142,12 +126,9 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
 
 static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	const struct snapshot_t *s = snapshot_t(c, id);
-	int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
-	rcu_read_unlock();
-
-	return ret;
+	return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
 }
 
 static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
@@ -160,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
 
 static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
 {
-	u32 depth;
-
-	rcu_read_lock();
-	depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
-	rcu_read_unlock();
-
-	return depth;
+	guard(rcu)();
+	return parent ? snapshot_t(c, parent)->depth + 1 : 0;
 }
 
 bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
@@ -180,12 +156,9 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
 
 static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
 {
-	rcu_read_lock();
+	guard(rcu)();
 	const struct snapshot_t *t = snapshot_t(c, id);
-	bool ret = t && (t->children[0]|t->children[1]) != 0;
-	rcu_read_unlock();
-
-	return ret;
+	return t && (t->children[0]|t->children[1]) != 0;
 }
 
 static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 35c9f86a73c1..170c5f6b6ce7 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -141,13 +141,9 @@ static int check_subvol(struct btree_trans *trans,
 
 	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
 		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
-		u32 snapshot_tree;
+		u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
+
 		struct bch_snapshot_tree st;
-
-		rcu_read_lock();
-		snapshot_tree = snapshot_t(c, snapshot_root)->tree;
-		rcu_read_unlock();
-
 		ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
 
 		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index df42a66b8bc3..56ca2e4a415b 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -219,23 +219,17 @@ static int bch2_fs_init_rw(struct bch_fs *);
 
 struct bch_fs *bch2_dev_to_fs(dev_t dev)
 {
+	guard(mutex)(&bch_fs_list_lock);
+	guard(rcu)();
+
 	struct bch_fs *c;
-
-	mutex_lock(&bch_fs_list_lock);
-	rcu_read_lock();
-
 	list_for_each_entry(c, &bch_fs_list, list)
 		for_each_member_device_rcu(c, ca, NULL)
 			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
 				closure_get(&c->cl);
-				goto found;
+				return c;
 			}
-	c = NULL;
-found:
-	rcu_read_unlock();
-	mutex_unlock(&bch_fs_list_lock);
-
-	return c;
+	return NULL;
 }
 
 static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
@@ -507,13 +501,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
 	clear_bit(BCH_FS_clean_shutdown, &c->flags);
 
-	rcu_read_lock();
-	for_each_online_member_rcu(c, ca)
-		if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-			bch2_dev_allocator_add(c, ca);
-			enumerated_ref_start(&ca->io_ref[WRITE]);
-		}
-	rcu_read_unlock();
+	scoped_guard(rcu)
+		for_each_online_member_rcu(c, ca)
+			if (ca->mi.state == BCH_MEMBER_STATE_rw) {
+				bch2_dev_allocator_add(c, ca);
+				enumerated_ref_start(&ca->io_ref[WRITE]);
+			}
 
 	bch2_recalc_capacity(c);
 
@@ -1184,22 +1177,20 @@ int bch2_fs_start(struct bch_fs *c)
 		goto err;
 	}
 
-	rcu_read_lock();
-	for_each_online_member_rcu(c, ca)
-		bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
-		cpu_to_le64(now);
-	rcu_read_unlock();
+	scoped_guard(rcu)
+		for_each_online_member_rcu(c, ca)
+			bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+			cpu_to_le64(now);
 
 	/*
 	 * Dno't write superblock yet: recovery might have to downgrade
 	 */
 	mutex_unlock(&c->sb_lock);
 
-	rcu_read_lock();
-	for_each_online_member_rcu(c, ca)
-		if (ca->mi.state == BCH_MEMBER_STATE_rw)
-			bch2_dev_allocator_add(c, ca);
-	rcu_read_unlock();
+	scoped_guard(rcu)
+		for_each_online_member_rcu(c, ca)
+			if (ca->mi.state == BCH_MEMBER_STATE_rw)
+				bch2_dev_allocator_add(c, ca);
 	bch2_recalc_capacity(c);
 	up_write(&c->state_lock);
 

From 132263220ddacaa5d4a93c34675eb9639dcc3707 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 30 May 2025 19:09:11 -0400
Subject: [PATCH 48/69] bcachefs: Add better logging to fsck_rename_dirent()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/str_hash.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index f101ca8581d9..7b97852d1cdf 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -61,14 +61,19 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
 						(subvol_inum) { 0, old.k->p.inode },
 						old.k->p.snapshot, &new->k_i,
 						BTREE_UPDATE_internal_snapshot_node);
-		if (!bch2_err_matches(ret, EEXIST))
+		if (ret && !bch2_err_matches(ret, EEXIST))
+			goto err;
+		if (!ret)
 			break;
 	}
 
 	if (ret)
-		return ret;
+		goto err;
 
-	return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+	ret = bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+err:
+	bch_err_fn(trans->c, ret);
+	return ret;
 }
 
 static noinline int hash_pick_winner(struct btree_trans *trans,

From 165815c296076132f0399bd892fd879fc597ac83 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 19:20:33 -0400
Subject: [PATCH 49/69] bcachefs: Convert BUG() to error

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 631ee2af8585..61d6d99875ea 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -984,7 +984,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans,
 	int ret = 0;
 
 	if (d->v.d_type == DT_SUBVOL) {
-		BUG();
+		bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
+		ret = -BCH_ERR_fsck_repair_unimplemented;
 	} else {
 		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
 		if (ret)

From d47db3e63679541f4dd4696ce4661b6917cb9684 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 18:47:49 -0400
Subject: [PATCH 50/69] bcachefs: Delete redundant fsck_err()

'inode_has_wrong_backpointer'; we have more specific errors for every
case afterwards.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/inode.h | 9 ---------
 fs/bcachefs/namei.c | 9 ---------
 2 files changed, 18 deletions(-)

diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 77ad2d549541..82cec2836cbd 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -283,15 +283,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
 int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
 void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
 
-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
-{
-	bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
-	return S_ISDIR(inode->bi_mode) ||
-		inode->bi_subvol ||
-		(!inode->bi_nlink && inode_has_bp);
-}
-
 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
 void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
 			 struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index c57da4029d36..7ba3dee7eb2d 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -733,15 +733,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
 		return __bch2_fsck_write_inode(trans, target);
 	}
 
-	if (bch2_inode_should_have_single_bp(target) &&
-	    !fsck_err(trans, inode_wrong_backpointer,
-		      "dirent points to inode that does not point back:\n%s",
-		      (bch2_bkey_val_to_text(&buf, c, d.s_c),
-		       prt_newline(&buf),
-		       bch2_inode_unpacked_to_text(&buf, target),
-		       buf.buf)))
-		goto err;
-
 	struct bkey_s_c_dirent bp_dirent =
 		bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
 			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),

From 95fafc0f3407a6446082c11849df585bd3246571 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 18:32:37 -0400
Subject: [PATCH 51/69] bcachefs: Kill un-reverted directory i_size code

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c | 12 ++----------
 fs/bcachefs/dirent.h |  4 ++--
 fs/bcachefs/namei.c  |  4 ++--
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 37d7cf69ae1d..066134145721 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -402,8 +402,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
 }
 
 int bch2_dirent_rename(struct btree_trans *trans,
-		subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
-		subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
+		subvol_inum src_dir, struct bch_hash_info *src_hash,
+		subvol_inum dst_dir, struct bch_hash_info *dst_hash,
 		const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
 		const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
 		enum bch_rename_mode mode)
@@ -542,14 +542,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	    new_src->v.d_type == DT_SUBVOL)
 		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
 
-	if (old_dst.k)
-		*dst_dir_i_size -= bkey_bytes(old_dst.k);
-	*src_dir_i_size -= bkey_bytes(old_src.k);
-
-	if (mode == BCH_RENAME_EXCHANGE)
-		*src_dir_i_size += bkey_bytes(&new_src->k);
-	*dst_dir_i_size += bkey_bytes(&new_dst->k);
-
 	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
 	if (ret)
 		goto out;
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 1f600dedafe1..f94d589cab6f 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -80,8 +80,8 @@ enum bch_rename_mode {
 };
 
 int bch2_dirent_rename(struct btree_trans *,
-		       subvol_inum, struct bch_hash_info *, u64 *,
-		       subvol_inum, struct bch_hash_info *, u64 *,
+		       subvol_inum, struct bch_hash_info *,
+		       subvol_inum, struct bch_hash_info *,
 		       const struct qstr *, subvol_inum *, u64 *,
 		       const struct qstr *, subvol_inum *, u64 *,
 		       enum bch_rename_mode);
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index 7ba3dee7eb2d..ee6fe35cec80 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -425,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans,
 	}
 
 	ret = bch2_dirent_rename(trans,
-				 src_dir, &src_hash, &src_dir_u->bi_size,
-				 dst_dir, &dst_hash, &dst_dir_u->bi_size,
+				 src_dir, &src_hash,
+				 dst_dir, &dst_hash,
 				 src_name, &src_inum, &src_offset,
 				 dst_name, &dst_inum, &dst_offset,
 				 mode);

From 36a2fdf7c5c1ccae6ca16cd14067567096cebe17 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 11:58:11 -0400
Subject: [PATCH 52/69] bcachefs: Repair code for directory i_size

We had a bug due due to an incomplete revert of the patch implementing
directory i_size (summing up the size of the dirents), leading to
completely screwy i_size values that underflow.

Most userspace programs don't seem to care (e.g. du ignores it), but it
turns out this broke sshfs, so needs to be repaired.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c             | 8 ++++++++
 fs/bcachefs/sb-errors_format.h | 3 ++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 61d6d99875ea..66b7a87a375b 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1167,6 +1167,14 @@ static int check_inode(struct btree_trans *trans,
 		ret = 0;
 	}
 
+	if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
+			trans, inode_dir_has_nonzero_i_size,
+			"directory %llu:%u with nonzero i_size %lli",
+			u.bi_inum, u.bi_snapshot, u.bi_size)) {
+		u.bi_size = 0;
+		do_update = true;
+	}
+
 	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
 	if (ret < 0)
 		goto err;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 0bfb151da9cf..8999dab1ff9b 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -232,6 +232,7 @@ enum bch_fsck_flags {
 	x(inode_dir_multiple_links,				206,	FSCK_AUTOFIX)	\
 	x(inode_dir_missing_backpointer,			284,	FSCK_AUTOFIX)	\
 	x(inode_dir_unlinked_but_not_empty,			286,	FSCK_AUTOFIX)	\
+	x(inode_dir_has_nonzero_i_size,				319,	FSCK_AUTOFIX)	\
 	x(inode_multiple_links_but_nlink_0,			207,	FSCK_AUTOFIX)	\
 	x(inode_wrong_backpointer,				208,	FSCK_AUTOFIX)	\
 	x(inode_wrong_nlink,					209,	FSCK_AUTOFIX)	\
@@ -328,7 +329,7 @@ enum bch_fsck_flags {
 	x(dirent_stray_data_after_cf_name,			305,	0)		\
 	x(rebalance_work_incorrectly_set,			309,	FSCK_AUTOFIX)	\
 	x(rebalance_work_incorrectly_unset,			310,	FSCK_AUTOFIX)	\
-	x(MAX,							319,	0)
+	x(MAX,							320,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,

From 09b9c72bd4b77a954123997377665fb30f1d07e1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 28 May 2025 11:57:50 -0400
Subject: [PATCH 53/69] bcachefs: bch_err_throw()

Add a tracepoint for any time we return an error and unwind.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c            | 10 ++--
 fs/bcachefs/alloc_foreground.c            | 23 ++++----
 fs/bcachefs/backpointers.c                | 17 +++---
 fs/bcachefs/backpointers.h                |  2 +-
 fs/bcachefs/bcachefs.h                    | 72 +++++++++++++----------
 fs/bcachefs/btree_cache.c                 | 24 ++++----
 fs/bcachefs/btree_gc.c                    |  8 +--
 fs/bcachefs/btree_io.c                    | 20 +++----
 fs/bcachefs/btree_iter.c                  |  4 +-
 fs/bcachefs/btree_journal_iter.c          |  6 +-
 fs/bcachefs/btree_key_cache.c             | 12 ++--
 fs/bcachefs/btree_trans_commit.c          | 15 ++---
 fs/bcachefs/btree_update.c                |  2 +-
 fs/bcachefs/btree_update_interior.c       | 10 ++--
 fs/bcachefs/btree_write_buffer.c          |  6 +-
 fs/bcachefs/buckets.c                     | 38 ++++++------
 fs/bcachefs/buckets_waiting_for_journal.c |  3 +-
 fs/bcachefs/chardev.c                     |  2 +-
 fs/bcachefs/checksum.c                    |  8 +--
 fs/bcachefs/compress.c                    | 20 +++----
 fs/bcachefs/data_update.c                 | 20 +++----
 fs/bcachefs/dirent.c                      |  4 +-
 fs/bcachefs/disk_accounting.c             | 10 ++--
 fs/bcachefs/disk_groups.c                 |  2 +-
 fs/bcachefs/ec.c                          | 63 ++++++++++----------
 fs/bcachefs/error.c                       | 58 +++++++++---------
 fs/bcachefs/error.h                       |  2 +-
 fs/bcachefs/extents.c                     | 12 ++--
 fs/bcachefs/fs-io-pagecache.c             |  2 +-
 fs/bcachefs/fs-ioctl.c                    |  4 +-
 fs/bcachefs/fs.c                          |  4 +-
 fs/bcachefs/fsck.c                        | 35 +++++------
 fs/bcachefs/inode.c                       |  6 +-
 fs/bcachefs/io_misc.c                     |  2 +-
 fs/bcachefs/io_read.c                     | 32 +++++-----
 fs/bcachefs/io_read.h                     |  6 +-
 fs/bcachefs/io_write.c                    | 19 +++---
 fs/bcachefs/journal.c                     | 52 ++++++++--------
 fs/bcachefs/journal_io.c                  | 22 +++----
 fs/bcachefs/journal_reclaim.c             |  4 +-
 fs/bcachefs/journal_sb.c                  |  2 +-
 fs/bcachefs/journal_seq_blacklist.c       |  4 +-
 fs/bcachefs/migrate.c                     |  4 +-
 fs/bcachefs/move.c                        |  2 +-
 fs/bcachefs/namei.c                       |  4 +-
 fs/bcachefs/quota.c                       |  6 +-
 fs/bcachefs/rebalance.c                   |  4 +-
 fs/bcachefs/recovery.c                    |  2 +-
 fs/bcachefs/recovery_passes.c             |  4 +-
 fs/bcachefs/reflink.c                     |  6 +-
 fs/bcachefs/replicas.c                    | 14 ++---
 fs/bcachefs/sb-downgrade.c                |  2 +-
 fs/bcachefs/sb-members.c                  |  2 +-
 fs/bcachefs/snapshot.c                    | 12 ++--
 fs/bcachefs/str_hash.c                    |  2 +-
 fs/bcachefs/str_hash.h                    |  7 ++-
 fs/bcachefs/subvolume.c                   |  7 ++-
 fs/bcachefs/super-io.c                    |  8 +--
 fs/bcachefs/super.c                       | 48 +++++++--------
 fs/bcachefs/trace.h                       | 63 ++++++++++++++------
 60 files changed, 459 insertions(+), 405 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index e1231b20faec..b228a5a64479 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -21,7 +21,6 @@
 #include "error.h"
 #include "lru.h"
 #include "recovery.h"
-#include "trace.h"
 #include "varint.h"
 
 #include <linux/kthread.h>
@@ -866,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 
 	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
 	if (!ca)
-		return -BCH_ERR_trigger_alloc;
+		return bch_err_throw(c, trigger_alloc);
 
 	struct bch_alloc_v4 old_a_convert;
 	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -1045,7 +1044,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 invalid_bucket:
 	bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
 			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
-	ret = -BCH_ERR_trigger_alloc;
+	ret = bch_err_throw(c, trigger_alloc);
 	goto err;
 }
 
@@ -1459,7 +1458,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
 		ret =   bch2_btree_bit_mod_iter(trans, iter, false) ?:
 			bch2_trans_commit(trans, NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc) ?:
-			-BCH_ERR_transaction_restart_commit;
+			bch_err_throw(c, transaction_restart_commit);
 		goto out;
 	} else {
 		/*
@@ -1782,13 +1781,14 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 
 static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
 {
+	struct bch_fs *c = ca->fs;
 	int ret;
 
 	mutex_lock(&ca->discard_buckets_in_flight_lock);
 	struct discard_in_flight *i =
 		darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
 	if (i) {
-		ret = -BCH_ERR_EEXIST_discard_in_flight_add;
+		ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
 		goto out;
 	}
 
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index e157bc86b2f3..b375ad610acd 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -227,7 +227,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
 
 		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
 		spin_unlock(&c->freelist_lock);
-		return ERR_PTR(-BCH_ERR_open_buckets_empty);
+		return ERR_PTR(bch_err_throw(c, open_buckets_empty));
 	}
 
 	/* Recheck under lock: */
@@ -533,7 +533,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
 
 		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
 
-		ob = ERR_PTR(-BCH_ERR_freelist_empty);
+		ob = ERR_PTR(bch_err_throw(c, freelist_empty));
 		goto err;
 	}
 
@@ -558,7 +558,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
 	}
 err:
 	if (!ob)
-		ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+		ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
 
 	if (!IS_ERR(ob))
 		ob->data_type = req->data_type;
@@ -709,7 +709,7 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 				       struct closure *cl)
 {
 	struct bch_fs *c = trans->c;
-	int ret = -BCH_ERR_insufficient_devices;
+	int ret = 0;
 
 	BUG_ON(req->nr_effective >= req->nr_replicas);
 
@@ -738,13 +738,16 @@ inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 			continue;
 		}
 
-		if (add_new_bucket(c, req, ob)) {
-			ret = 0;
+		ret = add_new_bucket(c, req, ob);
+		if (ret)
 			break;
-		}
 	}
 
-	return ret;
+	if (ret == 1)
+		return 0;
+	if (ret)
+		return ret;
+	return bch_err_throw(c, insufficient_devices);
 }
 
 /* Allocate from stripes: */
@@ -1373,11 +1376,11 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
 		goto retry;
 
 	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
-		ret = -BCH_ERR_bucket_alloc_blocked;
+		ret = bch_err_throw(c, bucket_alloc_blocked);
 
 	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
-		ret = -BCH_ERR_bucket_alloc_blocked;
+		ret = bch_err_throw(c, bucket_alloc_blocked);
 
 	return ret;
 }
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ebc8ee2cd33e..e76809e71858 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -142,7 +142,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
 	}
 
 	if (!will_check && __bch2_inconsistent_error(c, &buf))
-		ret = -BCH_ERR_erofs_unfixed_errors;
+		ret = bch_err_throw(c, erofs_unfixed_errors);
 
 	bch_err(c, "%s", buf.buf);
 	printbuf_exit(&buf);
@@ -295,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
 		return b;
 
 	if (btree_node_will_make_reachable(b)) {
-		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+		b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
 	} else {
 		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
 						       last_flushed, commit);
@@ -353,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
 		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 	} else {
 		struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
-		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+		if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)))
 			return bkey_s_c_null;
 		if (IS_ERR_OR_NULL(b))
 			return ((struct bkey_s_c) { .k = ERR_CAST(b) });
@@ -651,7 +651,7 @@ static int check_bp_exists(struct btree_trans *trans,
 	prt_newline(&buf);
 	bch2_bkey_val_to_text(&buf, c, other_extent);
 	bch_err(c, "%s", buf.buf);
-	ret = -BCH_ERR_fsck_repair_unimplemented;
+	ret = bch_err_throw(c, fsck_repair_unimplemented);
 	goto err;
 missing:
 	printbuf_reset(&buf);
@@ -953,7 +953,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		    sectors[ALLOC_cached] > a->cached_sectors ||
 		    sectors[ALLOC_stripe] > a->stripe_sectors) {
 			ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
-				-BCH_ERR_transaction_restart_nested;
+				bch_err_throw(c, transaction_restart_nested);
 			goto err;
 		}
 
@@ -1351,7 +1351,7 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u
 			b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
 					      sizeof(unsigned long), GFP_KERNEL);
 			if (!b->buckets)
-				return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+				return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
 		}
 
 		b->nr += !__test_and_set_bit(bit, b->buckets);
@@ -1360,7 +1360,8 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u
 	return 0;
 }
 
-int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size)
+int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
+			      u64 old_size, u64 new_size)
 {
 	scoped_guard(mutex, &b->lock) {
 		if (!b->buckets)
@@ -1369,7 +1370,7 @@ int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_siz
 		unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
 					    sizeof(unsigned long), GFP_KERNEL);
 		if (!n)
-			return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
+			return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
 
 		memcpy(n, b->buckets,
 		       BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index fac05948da1c..7e71afee1ac0 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -194,7 +194,7 @@ static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
 	return bitmap && test_bit(i, bitmap);
 }
 
-int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64);
+int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
 void bch2_bucket_bitmap_free(struct bucket_bitmap *);
 
 #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 7824da2af9d0..3651a296d506 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -183,6 +183,16 @@
 #define pr_fmt(fmt) "%s() " fmt "\n", __func__
 #endif
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define ENUMERATED_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...)		0
+#endif
+
+#define race_fault(...)			dynamic_fault("bcachefs:race")
+
 #include <linux/backing-dev-defs.h>
 #include <linux/bug.h>
 #include <linux/bio.h>
@@ -219,15 +229,30 @@
 #include "time_stats.h"
 #include "util.h"
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define ENUMERATED_REF_DEBUG
-#endif
+#include "alloc_types.h"
+#include "async_objs_types.h"
+#include "btree_gc_types.h"
+#include "btree_types.h"
+#include "btree_node_scan_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "enumerated_ref_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "recovery_passes_types.h"
+#include "replicas_types.h"
+#include "sb-members_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+#include "thread_with_file_types.h"
 
-#ifndef dynamic_fault
-#define dynamic_fault(...)		0
-#endif
-
-#define race_fault(...)			dynamic_fault("bcachefs:race")
+#include "trace.h"
 
 #define count_event(_c, _name)	this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
 
@@ -380,6 +405,14 @@ do {									\
 		pr_info(fmt, ##__VA_ARGS__);				\
 } while (0)
 
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+	trace_error_throw(c, err, _THIS_IP_);
+	return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
 /* Parameters that are useful for debugging, but should always be compiled in: */
 #define BCH_DEBUG_PARAMS_ALWAYS()					\
 	BCH_DEBUG_PARAM(key_merging_disabled,				\
@@ -486,29 +519,6 @@ enum bch_time_stats {
 	BCH_TIME_STAT_NR
 };
 
-#include "alloc_types.h"
-#include "async_objs_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "enumerated_ref_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "recovery_passes_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES		4U
 
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8557cbd3d818..91e0aa796e6b 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -149,7 +149,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 
 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
 	if (!b->data)
-		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
 #ifdef __KERNEL__
 	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
 #else
@@ -162,7 +162,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 	if (!b->aux_data) {
 		kvfree(b->data);
 		b->data = NULL;
-		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+		return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
 	}
 
 	return 0;
@@ -353,21 +353,21 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
 
 	if (btree_node_noevict(b)) {
 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 	}
 	if (btree_node_write_blocked(b)) {
 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 	}
 	if (btree_node_will_make_reachable(b)) {
 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 	}
 
 	if (btree_node_dirty(b)) {
 		if (!flush) {
 			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
-			return -BCH_ERR_ENOMEM_btree_node_reclaim;
+			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 		}
 
 		if (locked) {
@@ -393,7 +393,7 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
 				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
 			else if (btree_node_write_in_flight(b))
 				bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
-			return -BCH_ERR_ENOMEM_btree_node_reclaim;
+			return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 		}
 
 		if (locked)
@@ -424,13 +424,13 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 
 	if (!six_trylock_intent(&b->c.lock)) {
 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 	}
 
 	if (!six_trylock_write(&b->c.lock)) {
 		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
 		six_unlock_intent(&b->c.lock);
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+		return bch_err_throw(c, ENOMEM_btree_node_reclaim);
 	}
 
 	/* recheck under lock */
@@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
 	return 0;
 err:
-	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+	return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
 }
 
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
@@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
 
 	if (!cl) {
 		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+		return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
 	}
 
 	closure_wait(&bc->alloc_wait, cl);
@@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
 	}
 
 	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
-	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+	return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
 
 success:
 	trace_and_count(c, btree_cache_cannibalize_lock, trans);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index f95486729ba5..9ddcbe1bda78 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -150,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
 
 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
 	if (!new)
-		return -BCH_ERR_ENOMEM_gc_repair_key;
+		return bch_err_throw(c, ENOMEM_gc_repair_key);
 
 	btree_ptr_to_v2(b, new);
 	b->data->min_key	= new_min;
@@ -190,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
 
 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
 	if (!new)
-		return -BCH_ERR_ENOMEM_gc_repair_key;
+		return bch_err_throw(c, ENOMEM_gc_repair_key);
 
 	btree_ptr_to_v2(b, new);
 	b->data->max_key	= new_max;
@@ -935,7 +935,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
 		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
 		if (ret) {
 			bch2_dev_put(ca);
-			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+			ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
 			break;
 		}
 	}
@@ -1180,7 +1180,7 @@ int bch2_gc_gens(struct bch_fs *c)
 		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
 		if (!ca->oldest_gen) {
 			bch2_dev_put(ca);
-			ret = -BCH_ERR_ENOMEM_gc_gens;
+			ret = bch_err_throw(c, ENOMEM_gc_gens);
 			goto err;
 		}
 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 2e191561d578..57eff3012a7b 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -557,7 +557,7 @@ static int __btree_err(int ret,
 		       const char *fmt, ...)
 {
 	if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
-		return -BCH_ERR_fsck_fix;
+		return bch_err_throw(c, fsck_fix);
 
 	bool have_retry = false;
 	int ret2;
@@ -572,9 +572,9 @@ static int __btree_err(int ret,
 	}
 
 	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
-		ret = -BCH_ERR_btree_node_read_err_fixable;
+		ret = bch_err_throw(c, btree_node_read_err_fixable);
 	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
-		ret = -BCH_ERR_btree_node_read_err_bad_node;
+		ret = bch_err_throw(c, btree_node_read_err_bad_node);
 
 	bch2_sb_error_count(c, err_type);
 
@@ -609,7 +609,7 @@ static int __btree_err(int ret,
 			}
 
 			if (!have_retry)
-				ret = -BCH_ERR_fsck_fix;
+				ret = bch_err_throw(c, fsck_fix);
 			goto out;
 		case -BCH_ERR_btree_node_read_err_bad_node:
 			prt_str(&out, ", ");
@@ -638,7 +638,7 @@ static int __btree_err(int ret,
 		}
 
 		if (!have_retry)
-			ret = -BCH_ERR_fsck_fix;
+			ret = bch_err_throw(c, fsck_fix);
 		goto out;
 	case -BCH_ERR_btree_node_read_err_bad_node:
 		prt_str(&out, ", ");
@@ -1687,7 +1687,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 	if (!ra)
-		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+		return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
 
 	closure_init(&ra->cl, NULL);
 	ra->c	= c;
@@ -1869,7 +1869,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
 		bch2_btree_node_hash_remove(&c->btree_cache, b);
 		mutex_unlock(&c->btree_cache.lock);
 
-		ret = -BCH_ERR_btree_node_read_error;
+		ret = bch_err_throw(c, btree_node_read_error);
 		goto err;
 	}
 
@@ -2019,7 +2019,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 
 	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
-		return -BCH_ERR_erofs_no_writes;
+		return bch_err_throw(c, erofs_no_writes);
 
 	struct extent_ptr_decoded pick;
 	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
@@ -2029,7 +2029,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
 	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
 						BCH_DEV_READ_REF_btree_node_scrub);
 	if (!ca) {
-		ret = -BCH_ERR_device_offline;
+		ret = bch_err_throw(c, device_offline);
 		goto err;
 	}
 
@@ -2166,7 +2166,7 @@ static void btree_node_write_work(struct work_struct *work)
 		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
 	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
-		ret = -BCH_ERR_btree_node_write_all_failed;
+		ret = bch_err_throw(c, btree_node_write_all_failed);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index c7606e0b113d..b78403376c07 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -938,7 +938,7 @@ static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
 
 	bch2_fs_fatal_error(c, "%s", buf.buf);
 	printbuf_exit(&buf);
-	return -BCH_ERR_btree_need_topology_repair;
+	return bch_err_throw(c, btree_need_topology_repair);
 }
 
 static __always_inline int btree_path_down(struct btree_trans *trans,
@@ -1006,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 	int ret = 0;
 
 	if (trans->in_traverse_all)
-		return -BCH_ERR_transaction_restart_in_traverse_all;
+		return bch_err_throw(c, transaction_restart_in_traverse_all);
 
 	trans->in_traverse_all = true;
 retry_all:
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 48863e6925e0..cf7398751644 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -292,7 +292,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 		if (!new_keys.data) {
 			bch_err(c, "%s: error allocating new key array (size %zu)",
 				__func__, new_keys.size);
-			return -BCH_ERR_ENOMEM_journal_key_insert;
+			return bch_err_throw(c, ENOMEM_journal_key_insert);
 		}
 
 		/* Since @keys was full, there was no gap: */
@@ -331,7 +331,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
 
 	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
 	if (!n)
-		return -BCH_ERR_ENOMEM_journal_key_insert;
+		return bch_err_throw(c, ENOMEM_journal_key_insert);
 
 	bkey_copy(n, k);
 	ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -736,7 +736,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 				if (keys->nr * 8 > keys->size * 7) {
 					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
 						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
-					return -BCH_ERR_ENOMEM_journal_keys_sort;
+					return bch_err_throw(c, ENOMEM_journal_keys_sort);
 				}
 
 				BUG_ON(darray_push(keys, n));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index e954b19756c0..d96188b92db2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -238,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
 		if (unlikely(!ck)) {
 			bch_err(c, "error allocating memory for key cache item, btree %s",
 				bch2_btree_id_str(ck_path->btree_id));
-			return -BCH_ERR_ENOMEM_btree_key_cache_create;
+			return bch_err_throw(c, ENOMEM_btree_key_cache_create);
 		}
 	}
 
@@ -256,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
 		if (unlikely(!new_k)) {
 			bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
 				bch2_btree_id_str(ck->key.btree_id), key_u64s);
-			ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+			ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
 		} else if (ret) {
 			kfree(new_k);
 			goto err;
@@ -822,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 	bc->nr_pending = alloc_percpu(size_t);
 	if (!bc->nr_pending)
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+		return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
 
 	if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
 	    rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+		return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
 
 	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+		return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
 
 	bc->table_init_done = true;
 
 	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
 	if (!shrink)
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+		return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
 	bc->shrink = shrink;
 	shrink->count_objects	= bch2_btree_key_cache_count;
 	shrink->scan_objects	= bch2_btree_key_cache_scan;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index f2d1edc9163c..d9710801e3ee 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -376,7 +376,7 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
 				       struct btree *b, unsigned u64s)
 {
 	if (!bch2_btree_node_insert_fits(b, u64s))
-		return -BCH_ERR_btree_insert_btree_node_full;
+		return bch_err_throw(trans->c, btree_insert_btree_node_full);
 
 	return 0;
 }
@@ -394,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
 
 	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
 	if (!new_k) {
-		bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+		struct bch_fs *c = trans->c;
+		bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
 			bch2_btree_id_str(path->btree_id), new_u64s);
-		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+		return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
 	}
 
 	ret =   bch2_trans_relock(trans) ?:
@@ -432,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
 	if (watermark < BCH_WATERMARK_reclaim &&
 	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 	    bch2_btree_key_cache_must_wait(c))
-		return -BCH_ERR_btree_insert_need_journal_reclaim;
+		return bch_err_throw(c, btree_insert_need_journal_reclaim);
 
 	/*
 	 * bch2_varint_decode can read past the end of the buffer by at most 7
@@ -894,7 +895,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		 */
 		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
 		    watermark < BCH_WATERMARK_reclaim) {
-			ret = -BCH_ERR_journal_reclaim_would_deadlock;
+			ret = bch_err_throw(c, journal_reclaim_would_deadlock);
 			goto out;
 		}
 
@@ -1024,7 +1025,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 		if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
 			ret = do_bch2_trans_commit_to_journal_replay(trans);
 		else
-			ret = -BCH_ERR_erofs_trans_commit;
+			ret = bch_err_throw(c, erofs_trans_commit);
 		goto out_reset;
 	}
 
@@ -1106,7 +1107,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	 * restart:
 	 */
 	if (flags & BCH_TRANS_COMMIT_no_journal_res) {
-		ret = -BCH_ERR_transaction_restart_nested;
+		ret = bch_err_throw(c, transaction_restart_nested);
 		goto out;
 	}
 
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index e04508da5f7b..e97e78c10f49 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -587,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
 	BUG_ON(k.k->type != KEY_TYPE_deleted);
 
 	if (bkey_gt(k.k->p, end)) {
-		ret = -BCH_ERR_ENOSPC_btree_slot;
+		ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 647b40efd27f..e3639008be5c 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1244,7 +1244,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		if (bch2_err_matches(ret, ENOSPC) &&
 		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
 		    watermark < BCH_WATERMARK_reclaim) {
-			ret = -BCH_ERR_journal_reclaim_would_deadlock;
+			ret = bch_err_throw(c, journal_reclaim_would_deadlock);
 			goto err;
 		}
 
@@ -2177,7 +2177,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
 	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
 		/* node has been freed: */
 		BUG_ON(!btree_node_dying(b));
-		ret = -BCH_ERR_btree_node_dying;
+		ret = bch_err_throw(trans->c, btree_node_dying);
 		goto err;
 	}
 
@@ -2791,16 +2791,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
 	c->btree_interior_update_worker =
 		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
 	if (!c->btree_interior_update_worker)
-		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+		return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
 
 	c->btree_node_rewrite_worker =
 		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
 	if (!c->btree_node_rewrite_worker)
-		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+		return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
 
 	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
 				      sizeof(struct btree_update)))
-		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+		return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
 
 	return 0;
 }
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index efb0c64d0aac..90b21e61d2b6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -394,7 +394,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		bool accounting_accumulated = false;
 		do {
 			if (race_fault()) {
-				ret = -BCH_ERR_journal_reclaim_would_deadlock;
+				ret = bch_err_throw(c, journal_reclaim_would_deadlock);
 				break;
 			}
 
@@ -633,7 +633,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
 	struct bch_fs *c = trans->c;
 
 	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
-		return -BCH_ERR_erofs_no_writes;
+		return bch_err_throw(c, erofs_no_writes);
 
 	int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
 	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
@@ -676,7 +676,7 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
 			goto err;
 
 		bch2_bkey_buf_copy(last_flushed, c, tmp.k);
-		ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+		ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
 	}
 err:
 	bch2_bkey_buf_exit(&tmp, c);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 56bd8f66295c..f25903c10e8a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -224,7 +224,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
 			switch (g->data_type) {
 			case BCH_DATA_sb:
 				bch_err(c, "btree and superblock in the same bucket - cannot repair");
-				ret = -BCH_ERR_fsck_repair_unimplemented;
+				ret = bch_err_throw(c, fsck_repair_unimplemented);
 				goto out;
 			case BCH_DATA_journal:
 				ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
@@ -440,7 +440,7 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf
 		 * us an error code for rewinding recovery
 		 */
 		if (!ret)
-			ret = -BCH_ERR_bucket_ref_update;
+			ret = bch_err_throw(c, bucket_ref_update);
 	} else {
 		/* Always ignore overwrite errors, so that deletion works */
 		ret = 0;
@@ -632,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
 	struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
 	if (unlikely(!ca)) {
 		if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
-			ret = -BCH_ERR_trigger_pointer;
+			ret = bch_err_throw(c, trigger_pointer);
 		goto err;
 	}
 
@@ -640,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
 	if (!bucket_valid(ca, bucket.offset)) {
 		if (insert) {
 			bch2_dev_bucket_missing(ca, bucket.offset);
-			ret = -BCH_ERR_trigger_pointer;
+			ret = bch_err_throw(c, trigger_pointer);
 		}
 		goto err;
 	}
@@ -662,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
 		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
 					    p.ptr.dev,
 					    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-			ret = -BCH_ERR_trigger_pointer;
+			ret = bch_err_throw(c, trigger_pointer);
 			goto err;
 		}
 
@@ -688,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 				s64 sectors,
 				enum btree_iter_update_trigger_flags flags)
 {
+	struct bch_fs *c = trans->c;
+
 	if (flags & BTREE_TRIGGER_transactional) {
 		struct btree_iter iter;
 		struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
@@ -705,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 			bch2_trans_inconsistent(trans,
 				"stripe pointer doesn't match stripe %llu",
 				(u64) p.ec.idx);
-			ret = -BCH_ERR_trigger_stripe_pointer;
+			ret = bch_err_throw(c, trigger_stripe_pointer);
 			goto err;
 		}
 
@@ -725,13 +727,11 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 	}
 
 	if (flags & BTREE_TRIGGER_gc) {
-		struct bch_fs *c = trans->c;
-
 		struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
 		if (!m) {
 			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
 				(u64) p.ec.idx);
-			return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+			return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
 		}
 
 		gc_stripe_lock(m);
@@ -746,7 +746,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
 			__bch2_inconsistent_error(c, &buf);
 			bch2_print_str(c, KERN_ERR, buf.buf);
 			printbuf_exit(&buf);
-			return -BCH_ERR_trigger_stripe_pointer;
+			return bch_err_throw(c, trigger_stripe_pointer);
 		}
 
 		m->block_sectors[p.ec.block] += sectors;
@@ -1014,7 +1014,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 		bch2_print_str(c, KERN_ERR, buf.buf);
 		printbuf_exit(&buf);
 		if (!ret)
-			ret = -BCH_ERR_metadata_bucket_inconsistency;
+			ret = bch_err_throw(c, metadata_bucket_inconsistency);
 		goto err;
 	}
 
@@ -1067,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
 err_unlock:
 	bucket_unlock(g);
 err:
-	return -BCH_ERR_metadata_bucket_inconsistency;
+	return bch_err_throw(c, metadata_bucket_inconsistency);
 }
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1282,7 +1282,7 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 		ret = 0;
 	} else {
 		atomic64_set(&c->sectors_available, sectors_available);
-		ret = -BCH_ERR_ENOSPC_disk_reservation;
+		ret = bch_err_throw(c, ENOSPC_disk_reservation);
 	}
 
 	mutex_unlock(&c->sectors_available_lock);
@@ -1311,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
 					    GFP_KERNEL|__GFP_ZERO);
 		if (!ca->buckets_nouse) {
 			bch2_dev_put(ca);
-			return -BCH_ERR_ENOMEM_buckets_nouse;
+			return bch_err_throw(c, ENOMEM_buckets_nouse);
 		}
 	}
 
@@ -1336,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		lockdep_assert_held(&c->state_lock);
 
 	if (resize && ca->buckets_nouse)
-		return -BCH_ERR_no_resize_with_buckets_nouse;
+		return bch_err_throw(c, no_resize_with_buckets_nouse);
 
 	bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
 				    GFP_KERNEL|__GFP_ZERO);
 	if (!bucket_gens) {
-		ret = -BCH_ERR_ENOMEM_bucket_gens;
+		ret = bch_err_throw(c, ENOMEM_bucket_gens);
 		goto err;
 	}
 
@@ -1360,9 +1360,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 		       sizeof(bucket_gens->b[0]) * copy);
 	}
 
-	ret =   bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch,
+	ret =   bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
 					  ca->mi.nbuckets, nbuckets) ?:
-		bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty,
+		bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
 					  ca->mi.nbuckets, nbuckets);
 
 	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
@@ -1389,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
 	ca->usage = alloc_percpu(struct bch_dev_usage_full);
 	if (!ca->usage)
-		return -BCH_ERR_ENOMEM_usage_init;
+		return bch_err_throw(c, ENOMEM_usage_init);
 
 	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
 }
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index c8a488e6b7b8..832eff93acb6 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
 realloc:
 	n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
 	if (!n) {
-		ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+		struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
+		ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
 		goto out;
 	}
 
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 2db30eb134f3..2d38466eddfd 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -618,7 +618,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
 		if (ca->dev == dev)
 			return ca->dev_idx;
 
-	return -BCH_ERR_ENOENT_dev_idx_not_found;
+	return bch_err_throw(c, ENOENT_dev_idx_not_found);
 }
 
 static long bch2_ioctl_disk_resize(struct bch_fs *c,
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index ff5ab8ada777..3b5dc1233518 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
 
 	if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
 				    c, "attempting to encrypt without encryption key"))
-		return -BCH_ERR_no_encryption_key;
+		return bch_err_throw(c, no_encryption_key);
 
 	bch2_chacha20(&c->chacha20_key, nonce, data, len);
 	return 0;
@@ -262,7 +262,7 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
 
 	if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
 				    c, "attempting to encrypt without encryption key"))
-		return -BCH_ERR_no_encryption_key;
+		return bch_err_throw(c, no_encryption_key);
 
 	bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);
 
@@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 		prt_str(&buf, ")");
 		WARN_RATELIMIT(1, "%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_recompute_checksum;
+		return bch_err_throw(c, recompute_checksum);
 	}
 
 	for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
@@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
 	crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
 				     sizeof(*crypt) / sizeof(u64));
 	if (!crypt) {
-		ret = -BCH_ERR_ENOSPC_sb_crypt;
+		ret = bch_err_throw(c, ENOSPC_sb_crypt);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 1bca61d17092..b37b1f325f0a 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 			     __bch2_compression_types[crc.compression_type]))
 			ret = bch2_check_set_has_compressed_data(c, opt);
 		else
-			ret = -BCH_ERR_compression_workspace_not_initialized;
+			ret = bch_err_throw(c, compression_workspace_not_initialized);
 		if (ret)
 			goto err;
 	}
@@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 		ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
 						   src_len, dst_len, dst_len);
 		if (ret2 != dst_len)
-			ret = -BCH_ERR_decompress_lz4;
+			ret = bch_err_throw(c, decompress_lz4);
 		break;
 	case BCH_COMPRESSION_TYPE_gzip: {
 		z_stream strm = {
@@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 		mempool_free(workspace, workspace_pool);
 
 		if (ret2 != Z_STREAM_END)
-			ret = -BCH_ERR_decompress_gzip;
+			ret = bch_err_throw(c, decompress_gzip);
 		break;
 	}
 	case BCH_COMPRESSION_TYPE_zstd: {
@@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 		size_t real_src_len = le32_to_cpup(src_data.b);
 
 		if (real_src_len > src_len - 4) {
-			ret = -BCH_ERR_decompress_zstd_src_len_bad;
+			ret = bch_err_throw(c, decompress_zstd_src_len_bad);
 			goto err;
 		}
 
@@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 		mempool_free(workspace, workspace_pool);
 
 		if (ret2 != dst_len)
-			ret = -BCH_ERR_decompress_zstd;
+			ret = bch_err_throw(c, decompress_zstd);
 		break;
 	}
 	default:
@@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
 		bch2_write_op_error(op, op->pos.offset,
 				    "extent too big to decompress (%u > %u)",
 				    crc->uncompressed_size << 9, c->opts.encoded_extent_max);
-		return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+		return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
 	}
 
 	data = __bounce_alloc(c, dst_len, WRITE);
@@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
 
 	if (crc.uncompressed_size << 9	> c->opts.encoded_extent_max ||
 	    crc.compressed_size << 9	> c->opts.encoded_extent_max)
-		return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+		return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
 
 	dst_data = dst_len == dst_iter.bi_size
 		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
@@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 	if (!mempool_initialized(&c->compression_bounce[READ]) &&
 	    mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
 				       1, c->opts.encoded_extent_max))
-		return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+		return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
 
 	if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
 	    mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
 				       1, c->opts.encoded_extent_max))
-		return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+		return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
 
 	for (i = compression_types;
 	     i < compression_types + ARRAY_SIZE(compression_types);
@@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		if (mempool_init_kvmalloc_pool(
 				&c->compress_workspace[i->type],
 				1, i->compress_workspace))
-			return -BCH_ERR_ENOMEM_compression_workspace_init;
+			return bch_err_throw(c, ENOMEM_compression_workspace_init);
 	}
 
 	return 0;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 5c687ed1bcb2..92035859e588 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -255,7 +255,7 @@ static int data_update_invalid_bkey(struct data_update *m,
 	bch2_print_str(c, KERN_ERR, buf.buf);
 	printbuf_exit(&buf);
 
-	return -BCH_ERR_invalid_bkey;
+	return bch_err_throw(c, invalid_bkey);
 }
 
 static int __bch2_data_update_index_update(struct btree_trans *trans,
@@ -772,7 +772,7 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 {
 	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
 	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
-		return -BCH_ERR_data_update_done_would_block;
+		return bch_err_throw(c, data_update_done_would_block);
 
 	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
 		? m->op.target
@@ -802,9 +802,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	}
 
 	if (!nr_replicas)
-		return -BCH_ERR_data_update_done_no_rw_devs;
+		return bch_err_throw(c, data_update_done_no_rw_devs);
 	if (nr_replicas < m->op.nr_replicas)
-		return -BCH_ERR_insufficient_devices;
+		return bch_err_throw(c, insufficient_devices);
 	return 0;
 }
 
@@ -829,14 +829,14 @@ int bch2_data_update_init(struct btree_trans *trans,
 		 */
 		if (unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
 			     bch2_snapshot_id_state(c, k.k->p.snapshot) == SNAPSHOT_ID_empty))
-			return -BCH_ERR_data_update_done_no_snapshot;
+			return bch_err_throw(c, data_update_done_no_snapshot);
 
 		ret = bch2_check_key_has_snapshot(trans, iter, k);
 		if (ret < 0)
 			return ret;
 		if (ret) /* key was deleted */
 			return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-				-BCH_ERR_data_update_done_no_snapshot;
+				bch_err_throw(c, data_update_done_no_snapshot);
 		ret = 0;
 	}
 
@@ -943,7 +943,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 		if (iter)
 			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
 		if (!ret)
-			ret = -BCH_ERR_data_update_done_no_writes_needed;
+			ret = bch_err_throw(c, data_update_done_no_writes_needed);
 		goto out_bkey_buf_exit;
 	}
 
@@ -974,19 +974,19 @@ int bch2_data_update_init(struct btree_trans *trans,
 	}
 
 	if (!bkey_get_dev_refs(c, k)) {
-		ret = -BCH_ERR_data_update_done_no_dev_refs;
+		ret = bch_err_throw(c, data_update_done_no_dev_refs);
 		goto out_put_disk_res;
 	}
 
 	if (c->opts.nocow_enabled &&
 	    !bkey_nocow_lock(c, ctxt, ptrs)) {
-		ret = -BCH_ERR_nocow_lock_blocked;
+		ret = bch_err_throw(c, nocow_lock_blocked);
 		goto out_put_dev_refs;
 	}
 
 	if (unwritten) {
 		ret = bch2_update_unwritten_extent(trans, m) ?:
-			-BCH_ERR_data_update_done_unwritten;
+			bch_err_throw(c, data_update_done_unwritten);
 		goto out_nocow_unlock;
 	}
 
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 066134145721..62bb88250861 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -648,7 +648,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
 			struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 			if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
 				continue;
-			ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
+			ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
 			break;
 		}
 	bch2_trans_iter_exit(trans, &iter);
@@ -737,7 +737,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 		ret = bch2_inode_unpack(k, inode);
 		goto found;
 	}
-	ret = -BCH_ERR_ENOENT_inode;
+	ret = bch_err_throw(trans->c, ENOENT_inode);
 found:
 	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
 	bch2_trans_iter_exit(trans, &iter);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 6e4a68263dfb..3d59a57a5256 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -390,7 +390,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
 err:
 	free_percpu(n.v[1]);
 	free_percpu(n.v[0]);
-	return -BCH_ERR_ENOMEM_disk_accounting;
+	return bch_err_throw(c, ENOMEM_disk_accounting);
 }
 
 int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
@@ -401,7 +401,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
 	if (mode != BCH_ACCOUNTING_read &&
 	    accounting_to_replicas(&r.e, a.k->p) &&
 	    !bch2_replicas_marked_locked(c, &r.e))
-		return -BCH_ERR_btree_insert_need_mark_replicas;
+		return bch_err_throw(c, btree_insert_need_mark_replicas);
 
 	percpu_up_read(&c->mark_lock);
 	percpu_down_write(&c->mark_lock);
@@ -419,7 +419,7 @@ int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounti
 	if (mode != BCH_ACCOUNTING_read &&
 	    accounting_to_replicas(&r.e, a.k->p) &&
 	    !bch2_replicas_marked_locked(c, &r.e))
-		return -BCH_ERR_btree_insert_need_mark_replicas;
+		return bch_err_throw(c, btree_insert_need_mark_replicas);
 
 	return __bch2_accounting_mem_insert(c, a);
 }
@@ -559,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c)
 					     sizeof(u64), GFP_KERNEL);
 		if (!e->v[1]) {
 			bch2_accounting_free_counters(acc, true);
-			ret = -BCH_ERR_ENOMEM_disk_accounting;
+			ret = bch_err_throw(c, ENOMEM_disk_accounting);
 			break;
 		}
 	}
@@ -737,7 +737,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
 				bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
 			-BCH_ERR_remove_disk_accounting_entry;
 	} else {
-		ret = -BCH_ERR_remove_disk_accounting_entry;
+		ret = bch_err_throw(c, remove_disk_accounting_entry);
 	}
 	goto fsck_err;
 }
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 9a17ef78f647..cde842ac1886 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -130,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 
 	cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
 	if (!cpu_g)
-		return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+		return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
 
 	cpu_g->nr = nr_groups;
 
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 0a5f3a2e45a2..543dbba9b14f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -213,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
 				a->dirty_sectors,
 				a->stripe, s.k->p.offset,
 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 
@@ -224,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
 				a->dirty_sectors,
 				a->cached_sectors,
 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 	} else {
@@ -234,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
 				bucket.inode, bucket.offset, a->gen,
 				a->stripe,
 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 
@@ -244,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
 				bch2_data_type_str(a->data_type),
 				bch2_data_type_str(data_type),
 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 
@@ -256,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
 				a->dirty_sectors,
 				a->cached_sectors,
 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 	}
@@ -295,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
 	if (unlikely(!ca)) {
 		if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 		goto err;
 	}
 
@@ -325,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
 					    ptr->dev,
 					    (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
-			ret = -BCH_ERR_mark_stripe;
+			ret = bch_err_throw(c, mark_stripe);
 			goto err;
 		}
 
@@ -428,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
 			if (!gc) {
 				bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
-				return -BCH_ERR_ENOMEM_mark_stripe;
+				return bch_err_throw(c, ENOMEM_mark_stripe);
 			}
 
 			/*
@@ -536,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
 }
 
 /* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+static int ec_stripe_buf_init(struct bch_fs *c,
+			      struct ec_stripe_buf *buf,
 			      unsigned offset, unsigned size)
 {
 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
@@ -564,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
 	return 0;
 err:
 	ec_stripe_buf_exit(buf);
-	return -BCH_ERR_ENOMEM_stripe_buf;
+	return bch_err_throw(c, ENOMEM_stripe_buf);
 }
 
 /* Checksumming: */
@@ -840,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
 
 	buf = kzalloc(sizeof(*buf), GFP_NOFS);
 	if (!buf)
-		return -BCH_ERR_ENOMEM_ec_read_extent;
+		return bch_err_throw(c, ENOMEM_ec_read_extent);
 
 	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
 	if (ret) {
@@ -861,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
 		goto err;
 	}
 
-	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+	ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
 	if (ret) {
 		msg = "-ENOMEM";
 		goto err;
@@ -894,7 +895,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
 	bch_err_ratelimited(c,
 			    "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
 	printbuf_exit(&msgbuf);
-	ret = -BCH_ERR_stripe_reconstruct;
+	ret = bch_err_throw(c, stripe_reconstruct);
 	goto out;
 }
 
@@ -904,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
 {
 	if (c->gc_pos.phase != GC_PHASE_not_running &&
 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
-		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+		return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
 
 	return 0;
 }
@@ -1129,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
 
 		bch2_fs_inconsistent(c, "%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_erasure_coding_found_btree_node;
+		return bch_err_throw(c, erasure_coding_found_btree_node);
 	}
 
 	k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1195,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
 	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
 	if (!ca)
-		return -BCH_ERR_ENOENT_dev_not_found;
+		return bch_err_throw(c, ENOENT_dev_not_found);
 
 	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
 
@@ -1256,7 +1257,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
 	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
 				BCH_DEV_WRITE_REF_ec_bucket_zero);
 	if (!ca) {
-		s->err = -BCH_ERR_erofs_no_writes;
+		s->err = bch_err_throw(c, erofs_no_writes);
 		return;
 	}
 
@@ -1320,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 		if (ec_do_recov(c, &s->existing_stripe)) {
 			bch_err(c, "error creating stripe: error reading existing stripe");
-			ret = -BCH_ERR_ec_block_read;
+			ret = bch_err_throw(c, ec_block_read);
 			goto err;
 		}
 
@@ -1346,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
 	if (ec_nr_failed(&s->new_stripe)) {
 		bch_err(c, "error creating stripe: error writing redundancy buckets");
-		ret = -BCH_ERR_ec_block_write;
+		ret = bch_err_throw(c, ec_block_write);
 		goto err;
 	}
 
@@ -1865,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new
 	s->nr_data = existing_v->nr_blocks -
 		existing_v->nr_redundant;
 
-	int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+	int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
 	if (ret) {
 		bch2_stripe_close(c, s);
 		return ret;
@@ -1925,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 	}
 	bch2_trans_iter_exit(trans, &lru_iter);
 	if (!ret)
-		ret = -BCH_ERR_stripe_alloc_blocked;
+		ret = bch_err_throw(c, stripe_alloc_blocked);
 	if (ret == 1)
 		ret = 0;
 	if (ret)
@@ -1966,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
 				continue;
 			}
 
-			ret = -BCH_ERR_ENOSPC_stripe_create;
+			ret = bch_err_throw(c, ENOSPC_stripe_create);
 			break;
 		}
 
@@ -2024,7 +2025,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 	if (!h->s) {
 		h->s = ec_new_stripe_alloc(c, h);
 		if (!h->s) {
-			ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+			ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
 			bch_err(c, "failed to allocate new stripe");
 			goto err;
 		}
@@ -2089,7 +2090,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
 		goto err;
 
 allocate_buf:
-	ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
+	ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
 	if (ret)
 		goto err;
 
@@ -2115,6 +2116,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
 	if (k.k->type != KEY_TYPE_stripe)
 		return 0;
 
+	struct bch_fs *c = trans->c;
 	struct bkey_i_stripe *s =
 		bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
 	int ret = PTR_ERR_OR_ZERO(s);
@@ -2146,17 +2148,17 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
 			if (ptr->dev == dev_idx)
 				ptr->dev = BCH_SB_MEMBER_INVALID;
 
-			struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
+			struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
 			nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
 		}
 
 	if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-		return -BCH_ERR_remove_would_lose_data;
+		return bch_err_throw(c, remove_would_lose_data);
 
 	unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
 
 	if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
-		return -BCH_ERR_remove_would_lose_data;
+		return bch_err_throw(c, remove_would_lose_data);
 
 	sectors = -sectors;
 
@@ -2177,14 +2179,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s
 		return 0;
 
 	if (a->stripe_sectors) {
-		bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
-		return -BCH_ERR_invalidate_stripe_to_dev;
+		struct bch_fs *c = trans->c;
+		bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
+		return bch_err_throw(c, invalidate_stripe_to_dev);
 	}
 
 	struct btree_iter iter;
 	struct bkey_s_c_stripe s =
 		bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
-					BTREE_ITER_slots, stripe);
+					 BTREE_ITER_slots, stripe);
 	int ret = bkey_err(s);
 	if (ret)
 		return ret;
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 0488f966e1e4..63951e293c47 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -100,10 +100,10 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
 	set_bit(BCH_FS_topology_error, &c->flags);
 	if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
 		__bch2_inconsistent_error(c, out);
-		return -BCH_ERR_btree_need_topology_repair;
+		return bch_err_throw(c, btree_need_topology_repair);
 	} else {
 		return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
-			-BCH_ERR_btree_node_read_validate_error;
+			bch_err_throw(c, btree_node_read_validate_error);
 	}
 }
 
@@ -403,23 +403,23 @@ int bch2_fsck_err_opt(struct bch_fs *c,
 
 	if (test_bit(BCH_FS_in_fsck, &c->flags)) {
 		if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
-			return -BCH_ERR_fsck_repair_unimplemented;
+			return bch_err_throw(c, fsck_repair_unimplemented);
 
 		switch (c->opts.fix_errors) {
 		case FSCK_FIX_exit:
-			return -BCH_ERR_fsck_errors_not_fixed;
+			return bch_err_throw(c, fsck_errors_not_fixed);
 		case FSCK_FIX_yes:
 			if (flags & FSCK_CAN_FIX)
-				return -BCH_ERR_fsck_fix;
+				return bch_err_throw(c, fsck_fix);
 			fallthrough;
 		case FSCK_FIX_no:
 			if (flags & FSCK_CAN_IGNORE)
-				return -BCH_ERR_fsck_ignore;
-			return -BCH_ERR_fsck_errors_not_fixed;
+				return bch_err_throw(c, fsck_ignore);
+			return bch_err_throw(c, fsck_errors_not_fixed);
 		case FSCK_FIX_ask:
 			if (flags & FSCK_AUTOFIX)
-				return -BCH_ERR_fsck_fix;
-			return -BCH_ERR_fsck_ask;
+				return bch_err_throw(c, fsck_fix);
+			return bch_err_throw(c, fsck_ask);
 		default:
 			BUG();
 		}
@@ -427,12 +427,12 @@ int bch2_fsck_err_opt(struct bch_fs *c,
 		if ((flags & FSCK_AUTOFIX) &&
 		    (c->opts.errors == BCH_ON_ERROR_continue ||
 		     c->opts.errors == BCH_ON_ERROR_fix_safe))
-			return -BCH_ERR_fsck_fix;
+			return bch_err_throw(c, fsck_fix);
 
 		if (c->opts.errors == BCH_ON_ERROR_continue &&
 		    (flags & FSCK_CAN_IGNORE))
-			return -BCH_ERR_fsck_ignore;
-		return -BCH_ERR_fsck_errors_not_fixed;
+			return bch_err_throw(c, fsck_ignore);
+		return bch_err_throw(c, fsck_errors_not_fixed);
 	}
 }
 
@@ -474,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c,
 
 	if (test_bit(err, c->sb.errors_silent))
 		return flags & FSCK_CAN_FIX
-			? -BCH_ERR_fsck_fix
-			: -BCH_ERR_fsck_ignore;
+			? bch_err_throw(c, fsck_fix)
+			: bch_err_throw(c, fsck_ignore);
 
 	printbuf_indent_add_nextline(out, 2);
 
@@ -517,10 +517,10 @@ int __bch2_fsck_err(struct bch_fs *c,
 		prt_str(out, ", ");
 		if (flags & FSCK_CAN_FIX) {
 			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_fix;
+			ret = bch_err_throw(c, fsck_fix);
 		} else {
 			prt_str(out, ", continuing");
-			ret = -BCH_ERR_fsck_ignore;
+			ret = bch_err_throw(c, fsck_ignore);
 		}
 
 		goto print;
@@ -532,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c,
 					 "run fsck, and forward to devs so error can be marked for self-healing");
 			inconsistent = true;
 			print = true;
-			ret = -BCH_ERR_fsck_errors_not_fixed;
+			ret = bch_err_throw(c, fsck_errors_not_fixed);
 		} else if (flags & FSCK_CAN_FIX) {
 			prt_str(out, ", ");
 			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_fix;
+			ret = bch_err_throw(c, fsck_fix);
 		} else {
 			prt_str(out, ", continuing");
-			ret = -BCH_ERR_fsck_ignore;
+			ret = bch_err_throw(c, fsck_ignore);
 		}
 	} else if (c->opts.fix_errors == FSCK_FIX_exit) {
 		prt_str(out, ", exiting");
-		ret = -BCH_ERR_fsck_errors_not_fixed;
+		ret = bch_err_throw(c, fsck_errors_not_fixed);
 	} else if (flags & FSCK_CAN_FIX) {
 		int fix = s && s->fix
 			? s->fix
@@ -562,33 +562,33 @@ int __bch2_fsck_err(struct bch_fs *c,
 					: FSCK_FIX_yes;
 
 			ret = ret & 1
-				? -BCH_ERR_fsck_fix
-				: -BCH_ERR_fsck_ignore;
+				? bch_err_throw(c, fsck_fix)
+				: bch_err_throw(c, fsck_ignore);
 		} else if (fix == FSCK_FIX_yes ||
 			   (c->opts.nochanges &&
 			    !(flags & FSCK_CAN_IGNORE))) {
 			prt_str(out, ", ");
 			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_fix;
+			ret = bch_err_throw(c, fsck_fix);
 		} else {
 			prt_str(out, ", not ");
 			prt_actioning(out, action);
-			ret = -BCH_ERR_fsck_ignore;
+			ret = bch_err_throw(c, fsck_ignore);
 		}
 	} else {
 		if (flags & FSCK_CAN_IGNORE) {
 			prt_str(out, ", continuing");
-			ret = -BCH_ERR_fsck_ignore;
+			ret = bch_err_throw(c, fsck_ignore);
 		} else {
 			prt_str(out, " (repair unimplemented)");
-			ret = -BCH_ERR_fsck_repair_unimplemented;
+			ret = bch_err_throw(c, fsck_repair_unimplemented);
 		}
 	}
 
 	if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
 	    (c->opts.fix_errors == FSCK_FIX_exit ||
 	     !(flags & FSCK_CAN_IGNORE)))
-		ret = -BCH_ERR_fsck_errors_not_fixed;
+		ret = bch_err_throw(c, fsck_errors_not_fixed);
 
 	if (test_bit(BCH_FS_in_fsck, &c->flags) &&
 	    (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
@@ -657,12 +657,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
 			 const char *fmt, ...)
 {
 	if (from.flags & BCH_VALIDATE_silent)
-		return -BCH_ERR_fsck_delete_bkey;
+		return bch_err_throw(c, fsck_delete_bkey);
 
 	unsigned fsck_flags = 0;
 	if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
 		if (test_bit(err, c->sb.errors_silent))
-			return -BCH_ERR_fsck_delete_bkey;
+			return bch_err_throw(c, fsck_delete_bkey);
 
 		fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
 	}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 4babb0d12bb2..0c3c3a24fc6f 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -173,7 +173,7 @@ do {									\
 	if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) &&		\
 	    !bch2_err_matches(_ret, BCH_ERR_fsck_ignore))		\
 		ret = _ret;						\
-	ret = -BCH_ERR_fsck_delete_bkey;				\
+	ret = bch_err_throw(c, fsck_delete_bkey);			\
 	goto fsck_err;							\
 } while (0)
 
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 677cf453b332..036e4ad95987 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -193,7 +193,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 	bool have_dirty_ptrs = false, have_pick = false;
 
 	if (k.k->type == KEY_TYPE_error)
-		return -BCH_ERR_key_type_error;
+		return bch_err_throw(c, key_type_error);
 
 	rcu_read_lock();
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -286,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 	if (!have_dirty_ptrs)
 		return 0;
 	if (have_missing_devs)
-		return -BCH_ERR_no_device_to_read_from;
+		return bch_err_throw(c, no_device_to_read_from);
 	if (have_csum_errors)
-		return -BCH_ERR_data_read_csum_err;
+		return bch_err_throw(c, data_read_csum_err);
 	if (have_io_errors)
-		return -BCH_ERR_data_read_io_err;
+		return bch_err_throw(c, data_read_io_err);
 
 	/*
 	 * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
 	 * but they don't point to valid devices:
 	 */
-	return -BCH_ERR_no_devices_valid;
+	return bch_err_throw(c, no_devices_valid);
 }
 
 /* KEY_TYPE_btree_ptr: */
@@ -1515,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
 				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
 				prt_printf(err, "invalid compression opt %u:%u",
 					   opt.type, opt.level);
-				return -BCH_ERR_invalid_bkey;
+				return bch_err_throw(c, invalid_bkey);
 			}
 #endif
 			break;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index fbae9c1de746..c2cc405822f2 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c,
 
 					if (!reserved) {
 						bch2_disk_reservation_put(c, &disk_res);
-						return -BCH_ERR_ENOSPC_disk_reservation;
+						return bch_err_throw(c, ENOSPC_disk_reservation);
 					}
 					break;
 				}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 05361a793206..4e72e654da96 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -268,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 	}
 
 	if (dst_dentry->d_inode) {
-		error = -BCH_ERR_EEXIST_subvolume_create;
+		error = bch_err_throw(c, EEXIST_subvolume_create);
 		goto err3;
 	}
 
 	dir = dst_path.dentry->d_inode;
 	if (IS_DEADDIR(dir)) {
-		error = -BCH_ERR_ENOENT_directory_dead;
+		error = bch_err_throw(c, ENOENT_directory_dead);
 		goto err3;
 	}
 
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ebf967e82d8a..41008d4a2b99 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2007,14 +2007,14 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
 			goto err;
 
 		if (k.k->type != KEY_TYPE_dirent) {
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+			ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
 			goto err;
 		}
 
 		d = bkey_s_c_to_dirent(k);
 		ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
 		if (ret > 0)
-			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+			ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
 		if (ret)
 			goto err;
 
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 66b7a87a375b..a7bf949df816 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -23,14 +23,15 @@
 #include <linux/bsearch.h>
 #include <linux/dcache.h> /* struct qstr */
 
-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+static int dirent_points_to_inode_nowarn(struct bch_fs *c,
+					 struct bkey_s_c_dirent d,
 					 struct bch_inode_unpacked *inode)
 {
 	if (d.v->d_type == DT_SUBVOL
 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
 		return 0;
-	return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+	return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
 }
 
 static void dirent_inode_mismatch_msg(struct printbuf *out,
@@ -49,7 +50,7 @@ static int dirent_points_to_inode(struct bch_fs *c,
 				  struct bkey_s_c_dirent dirent,
 				  struct bch_inode_unpacked *inode)
 {
-	int ret = dirent_points_to_inode_nowarn(dirent, inode);
+	int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
 	if (ret) {
 		struct printbuf buf = PRINTBUF;
 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
@@ -152,7 +153,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans,
 			goto found;
 		}
 	}
-	ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+	ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
 found:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
@@ -229,7 +230,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 
 	if (d_type != DT_DIR) {
 		bch_err(c, "error looking up lost+found: not a directory");
-		return -BCH_ERR_ENOENT_not_directory;
+		return bch_err_throw(c, ENOENT_not_directory);
 	}
 
 	/*
@@ -531,7 +532,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
 
 	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
 		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
-		return -BCH_ERR_fsck_repair_unimplemented;
+		return bch_err_throw(c, fsck_repair_unimplemented);
 	}
 
 	/*
@@ -939,7 +940,7 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str
 		if (ret)
 			goto fsck_err;
 
-		ret = -BCH_ERR_transaction_restart_nested;
+		ret = bch_err_throw(c, transaction_restart_nested);
 		goto fsck_err;
 	}
 
@@ -1041,7 +1042,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
 	if (ret && !bch2_err_matches(ret, ENOENT))
 		return ret;
 
-	if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
+	if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
 	    inode->bi_subvol &&
 	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
 		/* Older version of a renamed subvolume root: we won't have a
@@ -1062,7 +1063,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
 			trans, inode_points_to_missing_dirent,
 			"inode points to missing dirent\n%s",
 			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
-	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
 			trans, inode_points_to_wrong_dirent,
 			"%s",
 			(printbuf_reset(&buf),
@@ -1453,7 +1454,7 @@ static int check_key_has_inode(struct btree_trans *trans,
 			goto err;
 
 		inode->last_pos.inode--;
-		ret = -BCH_ERR_transaction_restart_nested;
+		ret = bch_err_throw(c, transaction_restart_nested);
 		goto err;
 	}
 
@@ -1570,7 +1571,7 @@ static int extent_ends_at(struct bch_fs *c,
 			      sizeof(seen->ids.data[0]) * seen->ids.size,
 			      GFP_KERNEL);
 	if (!n.seen.ids.data)
-		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+		return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
 
 	__darray_for_each(extent_ends->e, i) {
 		if (i->snapshot == k.k->p.snapshot) {
@@ -1620,7 +1621,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
 
 		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
 			__func__, buf.buf);
-		ret = -BCH_ERR_internal_fsck_err;
+		ret = bch_err_throw(c, internal_fsck_err);
 		goto err;
 	}
 
@@ -1645,7 +1646,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
 	    pos2.size != k2.k->size) {
 		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
 			__func__, buf.buf);
-		ret = -BCH_ERR_internal_fsck_err;
+		ret = bch_err_throw(c, internal_fsck_err);
 		goto err;
 	}
 
@@ -1693,7 +1694,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
 			 * We overwrote the second extent - restart
 			 * check_extent() from the top:
 			 */
-			ret = -BCH_ERR_transaction_restart_nested;
+			ret = bch_err_throw(c, transaction_restart_nested);
 		}
 	}
 fsck_err:
@@ -2046,7 +2047,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
 		if (!new_parent_subvol) {
 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
-			return -BCH_ERR_fsck_repair_unimplemented;
+			return bch_err_throw(c, fsck_repair_unimplemented);
 		}
 
 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
@@ -2108,7 +2109,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 
 	if (ret) {
 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
-		ret = -BCH_ERR_fsck_repair_unimplemented;
+		ret = bch_err_throw(c, fsck_repair_unimplemented);
 		goto err;
 	}
 
@@ -2749,7 +2750,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
 		if (!d) {
 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
 				new_size);
-			return -BCH_ERR_ENOMEM_fsck_add_nlink;
+			return bch_err_throw(c, ENOMEM_fsck_add_nlink);
 		}
 
 		if (t->d)
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 5b35dae89c13..391daae5c171 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1041,7 +1041,7 @@ int bch2_inode_create(struct btree_trans *trans,
 		goto found_slot;
 
 	if (!ret && start == min)
-		ret = -BCH_ERR_ENOSPC_inode_create;
+		ret = bch_err_throw(trans->c, ENOSPC_inode_create);
 
 	if (ret) {
 		bch2_trans_iter_exit(trans, iter);
@@ -1161,7 +1161,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 		bch2_fs_inconsistent(c,
 				     "inode %llu:%u not found when deleting",
 				     inum.inum, snapshot);
-		ret = -BCH_ERR_ENOENT_inode;
+		ret = bch_err_throw(c, ENOENT_inode);
 		goto err;
 	}
 
@@ -1328,7 +1328,7 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum
 		bch2_fs_inconsistent(c,
 				     "inode %llu:%u not found when deleting",
 				     inum, snapshot);
-		ret = -BCH_ERR_ENOENT_inode;
+		ret = bch_err_throw(c, ENOENT_inode);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index cc07729a4b62..bf72b1d2e2cb 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
 				opts.data_replicas,
 				BCH_WATERMARK_normal, 0, &cl, &wp);
 		if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-			ret = -BCH_ERR_transaction_restart_nested;
+			ret = bch_err_throw(c, transaction_restart_nested);
 		if (ret)
 			goto err;
 
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index fb83c6942485..a77779afad01 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -137,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
 		BUG_ON(!opts.promote_target);
 
 		if (!(flags & BCH_READ_may_promote))
-			return -BCH_ERR_nopromote_may_not;
+			return bch_err_throw(c, nopromote_may_not);
 
 		if (bch2_bkey_has_target(c, k, opts.promote_target))
-			return -BCH_ERR_nopromote_already_promoted;
+			return bch_err_throw(c, nopromote_already_promoted);
 
 		if (bkey_extent_is_unwritten(k))
-			return -BCH_ERR_nopromote_unwritten;
+			return bch_err_throw(c, nopromote_unwritten);
 
 		if (bch2_target_congested(c, opts.promote_target))
-			return -BCH_ERR_nopromote_congested;
+			return bch_err_throw(c, nopromote_congested);
 	}
 
 	if (rhashtable_lookup_fast(&c->promote_table, &pos,
 				   bch_promote_params))
-		return -BCH_ERR_nopromote_in_flight;
+		return bch_err_throw(c, nopromote_in_flight);
 
 	return 0;
 }
@@ -239,7 +239,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 
 	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
 	if (!op) {
-		ret = -BCH_ERR_nopromote_enomem;
+		ret = bch_err_throw(c, nopromote_enomem);
 		goto err_put;
 	}
 
@@ -248,7 +248,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
 
 	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
 					  bch_promote_params)) {
-		ret = -BCH_ERR_nopromote_in_flight;
+		ret = bch_err_throw(c, nopromote_in_flight);
 		goto err;
 	}
 
@@ -544,7 +544,7 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
 
 	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
 		/* extent we wanted to read no longer exists: */
-		rbio->ret = -BCH_ERR_data_read_key_overwritten;
+		rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
 		goto err;
 	}
 
@@ -1035,7 +1035,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 
 	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
 	    !orig->data_update)
-		return -BCH_ERR_extent_poisoned;
+		return bch_err_throw(c, extent_poisoned);
 retry_pick:
 	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
 
@@ -1073,7 +1073,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 
 		bch_err_ratelimited(c, "%s", buf.buf);
 		printbuf_exit(&buf);
-		ret = -BCH_ERR_data_read_no_encryption_key;
+		ret = bch_err_throw(c, data_read_no_encryption_key);
 		goto err;
 	}
 
@@ -1127,7 +1127,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 			if (ca)
 				enumerated_ref_put(&ca->io_ref[READ],
 					BCH_DEV_READ_REF_io_read);
-			rbio->ret = -BCH_ERR_data_read_buffer_too_small;
+			rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
 			goto out_read_done;
 		}
 
@@ -1332,7 +1332,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 	 * have to signal that:
 	 */
 	if (u)
-		orig->ret = -BCH_ERR_data_read_key_overwritten;
+		orig->ret = bch_err_throw(c, data_read_key_overwritten);
 
 	zero_fill_bio_iter(&orig->bio, iter);
 out_read_done:
@@ -1509,18 +1509,18 @@ int bch2_fs_io_read_init(struct bch_fs *c)
 					 c->opts.btree_node_size,
 					 c->opts.encoded_extent_max) /
 				   PAGE_SIZE, 0))
-		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+		return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
 
 	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
 			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_bio_read_init;
+		return bch_err_throw(c, ENOMEM_bio_read_init);
 
 	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
 			BIOSET_NEED_BVECS))
-		return -BCH_ERR_ENOMEM_bio_read_split_init;
+		return bch_err_throw(c, ENOMEM_bio_read_split_init);
 
 	if (rhashtable_init(&c->promote_table, &bch_promote_params))
-		return -BCH_ERR_ENOMEM_promote_table_init;
+		return bch_err_throw(c, ENOMEM_promote_table_init);
 
 	return 0;
 }
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index c08b9c047b3e..45c959018919 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -91,6 +91,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
 		return 0;
 
 	*data_btree = BTREE_ID_reflink;
+
+	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
 						offset_into_extent,
@@ -102,10 +104,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
 
 	if (bkey_deleted(k.k)) {
 		bch2_trans_iter_exit(trans, &iter);
-		return -BCH_ERR_missing_indirect_extent;
+		return bch_err_throw(c, missing_indirect_extent);
 	}
 
-	bch2_bkey_buf_reassemble(extent, trans->c, k);
+	bch2_bkey_buf_reassemble(extent, c, k);
 	bch2_trans_iter_exit(trans, &iter);
 	return 0;
 }
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index b9d624e8f099..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -558,6 +558,7 @@ static void bch2_write_done(struct closure *cl)
 
 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 {
+	struct bch_fs *c = op->c;
 	struct keylist *keys = &op->insert_keys;
 	struct bkey_i *src, *dst = keys->keys, *n;
 
@@ -569,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 					    test_bit(ptr->dev, op->failed.d));
 
 			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
-				return -BCH_ERR_data_write_io;
+				return bch_err_throw(c, data_write_io);
 		}
 
 		if (dst != src)
@@ -976,7 +977,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct
 		op->crc.csum_type < BCH_CSUM_NR
 		? __bch2_csum_types[op->crc.csum_type]
 		: "(unknown)");
-	return -BCH_ERR_data_write_csum;
+	return bch_err_throw(c, data_write_csum);
 }
 
 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -1287,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 static void __bch2_nocow_write_done(struct bch_write_op *op)
 {
 	if (unlikely(op->flags & BCH_WRITE_io_error)) {
-		op->error = -BCH_ERR_data_write_io;
+		op->error = bch_err_throw(op->c, data_write_io);
 	} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
 		bch2_nocow_write_convert_unwritten(op);
 }
@@ -1480,10 +1481,10 @@ static void bch2_nocow_write(struct bch_write_op *op)
 				    "pointer to invalid bucket in nocow path on device %llu\n  %s",
 				    stale_at->b.inode,
 				    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-		ret = -BCH_ERR_data_write_invalid_ptr;
+		ret = bch_err_throw(c, data_write_invalid_ptr);
 	} else {
 		/* We can retry this: */
-		ret = -BCH_ERR_transaction_restart;
+		ret = bch_err_throw(c, transaction_restart);
 	}
 	printbuf_exit(&buf);
 
@@ -1690,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write)
 
 	if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
 		bch2_write_op_error(op, op->pos.offset, "misaligned write");
-		op->error = -BCH_ERR_data_write_misaligned;
+		op->error = bch_err_throw(c, data_write_misaligned);
 		goto err;
 	}
 
 	if (c->opts.nochanges) {
-		op->error = -BCH_ERR_erofs_no_writes;
+		op->error = bch_err_throw(c, erofs_no_writes);
 		goto err;
 	}
 
 	if (!(op->flags & BCH_WRITE_move) &&
 	    !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
-		op->error = -BCH_ERR_erofs_no_writes;
+		op->error = bch_err_throw(c, erofs_no_writes);
 		goto err;
 	}
 
@@ -1773,7 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c)
 {
 	if (bioset_init(&c->bio_write,   1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
 	    bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
-		return -BCH_ERR_ENOMEM_bio_write_init;
+		return bch_err_throw(c, ENOMEM_bio_write_init);
 
 	return 0;
 }
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 0348ab3276f4..dda802a656cf 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -397,7 +397,7 @@ static int journal_entry_open(struct journal *j)
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
 	if (j->blocked)
-		return -BCH_ERR_journal_blocked;
+		return bch_err_throw(c, journal_blocked);
 
 	if (j->cur_entry_error)
 		return j->cur_entry_error;
@@ -407,23 +407,23 @@ static int journal_entry_open(struct journal *j)
 		return ret;
 
 	if (!fifo_free(&j->pin))
-		return -BCH_ERR_journal_pin_full;
+		return bch_err_throw(c, journal_pin_full);
 
 	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
-		return -BCH_ERR_journal_max_in_flight;
+		return bch_err_throw(c, journal_max_in_flight);
 
 	if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
-		return -BCH_ERR_journal_max_open;
+		return bch_err_throw(c, journal_max_open);
 
 	if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
 		bch_err(c, "cannot start: journal seq overflow");
 		if (bch2_fs_emergency_read_only_locked(c))
 			bch_err(c, "fatal error - emergency read only");
-		return -BCH_ERR_journal_shutdown;
+		return bch_err_throw(c, journal_shutdown);
 	}
 
 	if (!j->free_buf && !buf->data)
-		return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
+		return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
 
 	BUG_ON(!j->cur_entry_sectors);
 
@@ -447,7 +447,7 @@ static int journal_entry_open(struct journal *j)
 	u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
 
 	if (u64s <= (ssize_t) j->early_journal_entries.nr)
-		return -BCH_ERR_journal_full;
+		return bch_err_throw(c, journal_full);
 
 	if (fifo_empty(&j->pin) && j->reclaim_thread)
 		wake_up_process(j->reclaim_thread);
@@ -464,7 +464,7 @@ static int journal_entry_open(struct journal *j)
 			journal_cur_seq(j));
 		if (bch2_fs_emergency_read_only_locked(c))
 			bch_err(c, "fatal error - emergency read only");
-		return -BCH_ERR_journal_shutdown;
+		return bch_err_throw(c, journal_shutdown);
 	}
 
 	BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
@@ -597,16 +597,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 		return ret;
 
 	if (j->blocked)
-		return -BCH_ERR_journal_blocked;
+		return bch_err_throw(c, journal_blocked);
 
 	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
-		ret = -BCH_ERR_journal_full;
+		ret = bch_err_throw(c, journal_full);
 		can_discard = j->can_discard;
 		goto out;
 	}
 
 	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
-		ret = -BCH_ERR_journal_max_in_flight;
+		ret = bch_err_throw(c, journal_max_in_flight);
 		goto out;
 	}
 
@@ -647,7 +647,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
 		goto retry;
 
 	if (journal_error_check_stuck(j, ret, flags))
-		ret = -BCH_ERR_journal_stuck;
+		ret = bch_err_throw(c, journal_stuck);
 
 	if (ret == -BCH_ERR_journal_max_in_flight &&
 	    track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
@@ -812,6 +812,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
 int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 				 struct closure *parent)
 {
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_buf *buf;
 	int ret = 0;
 
@@ -827,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 
 	/* Recheck under lock: */
 	if (j->err_seq && seq >= j->err_seq) {
-		ret = -BCH_ERR_journal_flush_err;
+		ret = bch_err_throw(c, journal_flush_err);
 		goto out;
 	}
 
@@ -998,7 +999,7 @@ int bch2_journal_meta(struct journal *j)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
 	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
-		return -BCH_ERR_erofs_no_writes;
+		return bch_err_throw(c, erofs_no_writes);
 
 	int ret = __bch2_journal_meta(j);
 	enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
@@ -1131,7 +1132,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
 	new_buckets	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
 	new_bucket_seq	= kcalloc(nr, sizeof(u64), GFP_KERNEL);
 	if (!bu || !ob || !new_buckets || !new_bucket_seq) {
-		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+		ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
 		goto err_free;
 	}
 
@@ -1322,7 +1323,7 @@ int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
 
 	u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);;
 	if (!new_buckets)
-		return -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+		return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
 
 	memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
 	memmove(&new_buckets[pos],
@@ -1372,14 +1373,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
 
 	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
 		bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
-		return -BCH_ERR_erofs_filesystem_full;
+		return bch_err_throw(c, erofs_filesystem_full);
 	}
 
 	unsigned nr;
 	int ret;
 
 	if (dynamic_fault("bcachefs:add:journal_alloc")) {
-		ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+		ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
 		goto err;
 	}
 
@@ -1518,7 +1519,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
 	if (!j->pin.data) {
 		bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
-		return -BCH_ERR_ENOMEM_journal_pin_fifo;
+		return bch_err_throw(c, ENOMEM_journal_pin_fifo);
 	}
 
 	j->replay_journal_seq	= last_seq;
@@ -1606,6 +1607,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca)
 
 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 {
+	struct bch_fs *c = ca->fs;
 	struct journal_device *ja = &ca->journal;
 	struct bch_sb_field_journal *journal_buckets =
 		bch2_sb_field_get(sb, journal);
@@ -1625,7 +1627,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 	ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
 	if (!ja->bucket_seq)
-		return -BCH_ERR_ENOMEM_dev_journal_init;
+		return bch_err_throw(c, ENOMEM_dev_journal_init);
 
 	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
@@ -1633,7 +1635,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 		ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
 				     nr_bvecs), GFP_KERNEL);
 		if (!ja->bio[i])
-			return -BCH_ERR_ENOMEM_dev_journal_init;
+			return bch_err_throw(c, ENOMEM_dev_journal_init);
 
 		ja->bio[i]->ca = ca;
 		ja->bio[i]->buf_idx = i;
@@ -1642,7 +1644,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
 	if (!ja->buckets)
-		return -BCH_ERR_ENOMEM_dev_journal_init;
+		return bch_err_throw(c, ENOMEM_dev_journal_init);
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
@@ -1696,10 +1698,12 @@ void bch2_fs_journal_init_early(struct journal *j)
 
 int bch2_fs_journal_init(struct journal *j)
 {
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
 	j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
 	j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
 	if (!j->free_buf)
-		return -BCH_ERR_ENOMEM_journal_buf;
+		return bch_err_throw(c, ENOMEM_journal_buf);
 
 	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
 		j->buf[i].idx = i;
@@ -1707,7 +1711,7 @@ int bch2_fs_journal_init(struct journal *j)
 	j->wq = alloc_workqueue("bcachefs_journal",
 				WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
 	if (!j->wq)
-		return -BCH_ERR_ENOMEM_fs_other_alloc;
+		return bch_err_throw(c, ENOMEM_fs_other_alloc);
 	return 0;
 }
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 8ce41753eadb..d4c9980a6a7b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -199,7 +199,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
 				GFP_KERNEL);
 	if (!_i)
-		return -BCH_ERR_ENOMEM_journal_entry_add;
+		return bch_err_throw(c, ENOMEM_journal_entry_add);
 
 	/*
 	 * Duplicate journal entries? If so we want the one that didn't have a
@@ -242,7 +242,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 replace:
 	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
 	if (!i)
-		return -BCH_ERR_ENOMEM_journal_entry_add;
+		return bch_err_throw(c, ENOMEM_journal_entry_add);
 
 	darray_init(&i->ptrs);
 	i->csum_good		= entry_ptr.csum_good;
@@ -322,7 +322,7 @@ static void journal_entry_err_msg(struct printbuf *out,
 		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
 		if (bch2_fs_inconsistent(c,				\
 				"corrupt metadata before write: %s\n", _buf.buf)) {\
-			ret = -BCH_ERR_fsck_errors_not_fixed;		\
+			ret = bch_err_throw(c, fsck_errors_not_fixed);		\
 			goto fsck_err;					\
 		}							\
 		break;							\
@@ -1020,19 +1020,19 @@ struct journal_read_buf {
 	size_t		size;
 };
 
-static int journal_read_buf_realloc(struct journal_read_buf *b,
+static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
 				    size_t new_size)
 {
 	void *n;
 
 	/* the bios are sized for this many pages, max: */
 	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
-		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
 
 	new_size = roundup_pow_of_two(new_size);
 	n = kvmalloc(new_size, GFP_KERNEL);
 	if (!n)
-		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
 
 	kvfree(b->data);
 	b->data = n;
@@ -1067,7 +1067,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 
 			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
 			if (!bio)
-				return -BCH_ERR_ENOMEM_journal_read_bucket;
+				return bch_err_throw(c, ENOMEM_journal_read_bucket);
 			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
 
 			bio->bi_iter.bi_sector = offset;
@@ -1078,7 +1078,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 			kfree(bio);
 
 			if (!ret && bch2_meta_read_fault("journal"))
-				ret = -BCH_ERR_EIO_fault_injected;
+				ret = bch_err_throw(c, EIO_fault_injected);
 
 			bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
 						   submit_time, !ret);
@@ -1106,7 +1106,7 @@ static int journal_read_bucket(struct bch_dev *ca,
 			break;
 		case JOURNAL_ENTRY_REREAD:
 			if (vstruct_bytes(j) > buf->size) {
-				ret = journal_read_buf_realloc(buf,
+				ret = journal_read_buf_realloc(c, buf,
 							vstruct_bytes(j));
 				if (ret)
 					return ret;
@@ -1206,7 +1206,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 	if (!ja->nr)
 		goto out;
 
-	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+	ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
 	if (ret)
 		goto err;
 
@@ -1691,7 +1691,7 @@ static CLOSURE_CALLBACK(journal_write_done)
 			       : j->noflush_write_time, j->write_start_time);
 
 	if (!w->devs_written.nr) {
-		err = -BCH_ERR_journal_write_err;
+		err = bch_err_throw(c, journal_write_err);
 	} else {
 		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
 					 w->devs_written);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 67be0c33f70c..cd6201741c59 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j)
 			bch_err(c, "%s", buf.buf);
 			printbuf_exit(&buf);
 		}
-		ret = -BCH_ERR_insufficient_journal_devices;
+		ret = bch_err_throw(c, insufficient_journal_devices);
 		goto out;
 	}
 
@@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j)
 	total		= j->space[journal_space_total].total;
 
 	if (!j->space[journal_space_discarded].next_entry)
-		ret = -BCH_ERR_journal_full;
+		ret = bch_err_throw(c, journal_full);
 
 	if ((j->space[journal_space_clean_ondisk].next_entry <
 	     j->space[journal_space_clean_ondisk].total) &&
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index 62b910f2fb27..0cb9b93f13e7 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
 	j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
 			 (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
 	if (!j)
-		return -BCH_ERR_ENOSPC_sb_journal;
+		return bch_err_throw(c, ENOSPC_sb_journal);
 
 	bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
 
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index c5a7d800a0f5..af4fe416d9ec 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 	bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
 				  sb_blacklist_u64s(nr + 1));
 	if (!bl) {
-		ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+		ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
 		goto out;
 	}
 
@@ -152,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
 
 	t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
 	if (!t)
-		return -BCH_ERR_ENOMEM_blacklist_table_init;
+		return bch_err_throw(c, ENOMEM_blacklist_table_init);
 
 	t->nr = nr;
 
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index bb7a92270c09..f296cce95338 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -35,7 +35,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
 	nr_good = bch2_bkey_durability(c, k.s_c);
 	if ((!nr_good && !(flags & lost)) ||
 	    (nr_good < replicas && !(flags & degraded)))
-		return -BCH_ERR_remove_would_lose_data;
+		return bch_err_throw(c, remove_would_lose_data);
 
 	return 0;
 }
@@ -156,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
 
 	/* don't handle this yet: */
 	if (flags & BCH_FORCE_IF_METADATA_LOST)
-		return -BCH_ERR_remove_with_metadata_missing_unimplemented;
+		return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
 
 	trans = bch2_trans_get(c);
 	bch2_bkey_buf_init(&k);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 5dafc0018b0e..eec591e947bd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -971,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
 		if (data_opts.scrub &&
 		    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
 			bch2_trans_iter_exit(trans, &iter);
-			ret = -BCH_ERR_device_offline;
+			ret = bch_err_throw(c, device_offline);
 			break;
 		}
 
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index ee6fe35cec80..24120037c031 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -287,7 +287,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	}
 
 	if (deleting_subvol && !inode_u->bi_subvol) {
-		ret = -BCH_ERR_ENOENT_not_subvol;
+		ret = bch_err_throw(c, ENOENT_not_subvol);
 		goto err;
 	}
 
@@ -633,7 +633,7 @@ static int __bch2_inum_to_path(struct btree_trans *trans,
 			break;
 
 		if (!inode.bi_dir && !inode.bi_dir_offset) {
-			ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+			ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
 			goto disconnected;
 		}
 
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 3d4755d73af7..f241efb1fb50 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
 	struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
 	if (!sb_quota) {
 		mutex_unlock(&c->sb_lock);
-		return -BCH_ERR_ENOSPC_sb_quota;
+		return bch_err_throw(c, ENOSPC_sb_quota);
 	}
 
 	bch2_sb_quota_read(c);
@@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block	*sb, unsigned uflags)
 	mutex_lock(&c->sb_lock);
 	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
 	if (!sb_quota) {
-		ret = -BCH_ERR_ENOSPC_sb_quota;
+		ret = bch_err_throw(c, ENOSPC_sb_quota);
 		goto unlock;
 	}
 
@@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 	mutex_lock(&c->sb_lock);
 	sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
 	if (!sb_quota) {
-		ret = -BCH_ERR_ENOSPC_sb_quota;
+		ret = bch_err_throw(c, ENOSPC_sb_quota);
 		goto unlock;
 	}
 
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index c563797ec5d5..1c345b86b1c0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -443,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
 		if (bch2_err_matches(ret, ENOMEM)) {
 			/* memory allocation failure, wait for some IO to finish */
 			bch2_move_ctxt_wait_for_io(ctxt);
-			ret = -BCH_ERR_transaction_restart_nested;
+			ret = bch_err_throw(c, transaction_restart_nested);
 		}
 
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -795,7 +795,7 @@ static int check_rebalance_work_one(struct btree_trans *trans,
 				     BTREE_ID_extents, POS_MIN,
 				     BTREE_ITER_prefetch|
 				     BTREE_ITER_all_snapshots);
-		return -BCH_ERR_transaction_restart_nested;
+		return bch_err_throw(c, transaction_restart_nested);
 	}
 
 	if (!extent_k.k && !rebalance_k.k)
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 4b51105bdb2e..1e68e61f08e8 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -879,7 +879,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 use_clean:
 		if (!clean) {
 			bch_err(c, "no superblock clean section found");
-			ret = -BCH_ERR_fsck_repair_impossible;
+			ret = bch_err_throw(c, fsck_repair_impossible);
 			goto err;
 
 		}
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 212658cb97dd..f6ddbab5ee82 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -329,7 +329,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
 	    (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
 		prt_printf(out, "need recovery pass %s (%u), but already rw\n",
 			   bch2_recovery_passes[pass], pass);
-		ret = -BCH_ERR_cannot_rewind_recovery;
+		ret = bch_err_throw(c, cannot_rewind_recovery);
 		goto out;
 	}
 
@@ -349,7 +349,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
 		if (rewind) {
 			r->next_pass = pass;
 			r->passes_complete &= (1ULL << pass) >> 1;
-			ret = -BCH_ERR_restart_recovery;
+			ret = bch_err_throw(c, restart_recovery);
 		}
 	} else {
 		prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 41ca86cba2cd..a535abd44df3 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -312,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
 
 	if (!bkey_refcount_c(k)) {
 		if (!(flags & BTREE_TRIGGER_overwrite))
-			ret = -BCH_ERR_missing_indirect_extent;
+			ret = bch_err_throw(c, missing_indirect_extent);
 		goto next;
 	}
 
@@ -612,7 +612,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 	int ret = 0, ret2 = 0;
 
 	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
-		return -BCH_ERR_erofs_no_writes;
+		return bch_err_throw(c, erofs_no_writes);
 
 	bch2_check_set_feature(c, BCH_FEATURE_reflink);
 
@@ -848,7 +848,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
 			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
 							c->reflink_gc_nr++, GFP_KERNEL);
 			if (!r) {
-				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+				ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
 				break;
 			}
 
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 4698f8866cd2..8383bd7fdb3f 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
 	return 0;
 bad:
 	bch2_replicas_entry_to_text(err, r);
-	return -BCH_ERR_invalid_replicas_entry;
+	return bch_err_throw(c, invalid_replicas_entry);
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
 		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
 		if (!new_gc.entries) {
-			ret = -BCH_ERR_ENOMEM_cpu_replicas;
+			ret = bch_err_throw(c, ENOMEM_cpu_replicas);
 			goto err;
 		}
 	}
@@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	if (!__replicas_has_entry(&c->replicas, new_entry)) {
 		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
 		if (!new_r.entries) {
-			ret = -BCH_ERR_ENOMEM_cpu_replicas;
+			ret = bch_err_throw(c, ENOMEM_cpu_replicas);
 			goto err;
 		}
 
@@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 	if (!c->replicas_gc.entries) {
 		mutex_unlock(&c->sb_lock);
 		bch_err(c, "error allocating c->replicas_gc");
-		return -BCH_ERR_ENOMEM_replicas_gc;
+		return bch_err_throw(c, ENOMEM_replicas_gc);
 	}
 
 	for_each_cpu_replicas_entry(&c->replicas, e)
@@ -458,7 +458,7 @@ int bch2_replicas_gc2(struct bch_fs *c)
 	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
 	if (!new.entries) {
 		bch_err(c, "error allocating c->replicas_gc");
-		return -BCH_ERR_ENOMEM_replicas_gc;
+		return bch_err_throw(c, ENOMEM_replicas_gc);
 	}
 
 	mutex_lock(&c->sb_lock);
@@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
 			DIV_ROUND_UP(bytes, sizeof(u64)));
 	if (!sb_r)
-		return -BCH_ERR_ENOSPC_sb_replicas;
+		return bch_err_throw(c, ENOSPC_sb_replicas);
 
 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
@@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
 	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
 			DIV_ROUND_UP(bytes, sizeof(u64)));
 	if (!sb_r)
-		return -BCH_ERR_ENOSPC_sb_replicas;
+		return bch_err_throw(c, ENOSPC_sb_replicas);
 
 	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
 	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 861fce1630f0..b61f88450a6d 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -417,7 +417,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
 
 	d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
 	if (!d) {
-		ret = -BCH_ERR_ENOSPC_sb_downgrade;
+		ret = bch_err_throw(c, ENOSPC_sb_downgrade);
 		goto out;
 	}
 
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index ab5673b34e1d..363eb0c6eb7c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -101,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
 
 		mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
 		if (!mi)
-			return -BCH_ERR_ENOSPC_sb_members_v2;
+			return bch_err_throw(c, ENOSPC_sb_members_v2);
 
 		for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
 			void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 612c526a94dd..24818d29982d 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -54,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
 					  BTREE_ITER_with_updates, snapshot_tree, s);
 
 	if (bch2_err_matches(ret, ENOENT))
-		ret = -BCH_ERR_ENOENT_snapshot_tree;
+		ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
 	return ret;
 }
 
@@ -67,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans)
 	struct bkey_i_snapshot_tree *s_t;
 
 	if (ret == -BCH_ERR_ENOSPC_btree_slot)
-		ret = -BCH_ERR_ENOSPC_snapshot_tree;
+		ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -285,7 +285,7 @@ static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
 	mutex_lock(&c->snapshot_table_lock);
 	int ret = snapshot_t_mut(c, id)
 		? 0
-		: -BCH_ERR_ENOMEM_mark_snapshot;
+		: bch_err_throw(c, ENOMEM_mark_snapshot);
 	mutex_unlock(&c->snapshot_table_lock);
 	return ret;
 }
@@ -304,7 +304,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
 
 	t = snapshot_t_mut(c, id);
 	if (!t) {
-		ret = -BCH_ERR_ENOMEM_mark_snapshot;
+		ret = bch_err_throw(c, ENOMEM_mark_snapshot);
 		goto err;
 	}
 
@@ -1006,7 +1006,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
 					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
 				if (t->nr > 1) {
 					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
-					ret = -BCH_ERR_fsck_repair_unimplemented;
+					ret = bch_err_throw(c, fsck_repair_unimplemented);
 					goto err;
 				}
 
@@ -1276,7 +1276,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
 			goto err;
 
 		if (!k.k || !k.k->p.offset) {
-			ret = -BCH_ERR_ENOSPC_snapshot_create;
+			ret = bch_err_throw(c, ENOSPC_snapshot_create);
 			goto err;
 		}
 
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 7b97852d1cdf..36d7ca7b0b1a 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -191,7 +191,7 @@ int bch2_repair_inode_hash_info(struct btree_trans *trans,
 #endif
 		bch2_print_str(c, KERN_ERR, buf.buf);
 		printbuf_exit(&buf);
-		ret = -BCH_ERR_fsck_repair_unimplemented;
+		ret = bch_err_throw(c, fsck_repair_unimplemented);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 6762b3627e1b..ebcc006825cd 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -261,6 +261,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
 			   struct bkey_i *insert,
 			   enum btree_iter_update_trigger_flags flags)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_iter slot = {};
 	struct bkey_s_c k;
 	bool found = false;
@@ -288,7 +289,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
 	}
 
 	if (!ret)
-		ret = -BCH_ERR_ENOSPC_str_hash_create;
+		ret = bch_err_throw(c, ENOSPC_str_hash_create);
 out:
 	bch2_trans_iter_exit(trans, &slot);
 	bch2_trans_iter_exit(trans, iter);
@@ -300,7 +301,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
 		bch2_trans_iter_exit(trans, &slot);
 		return k;
 	} else if (!found && (flags & STR_HASH_must_replace)) {
-		ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+		ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
 	} else {
 		if (!found && slot.path)
 			swap(*iter, slot);
@@ -328,7 +329,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
 		return ret;
 	if (k.k) {
 		bch2_trans_iter_exit(trans, &iter);
-		return -BCH_ERR_EEXIST_str_hash_set;
+		return bch_err_throw(trans->c, EEXIST_str_hash_set);
 	}
 
 	return 0;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 170c5f6b6ce7..e22837396bd9 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -593,7 +593,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
 	ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
 				BTREE_ID_subvolumes, POS(0, U32_MAX));
 	if (ret == -BCH_ERR_ENOSPC_btree_slot)
-		ret = -BCH_ERR_ENOSPC_subvolume_create;
+		ret = bch_err_throw(c, ENOSPC_subvolume_create);
 	if (ret)
 		return ret;
 
@@ -699,8 +699,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
 		return ret;
 
 	if (!bkey_is_inode(k.k)) {
-		bch_err(trans->c, "root inode not found");
-		ret = -BCH_ERR_ENOENT_inode;
+		struct bch_fs *c = trans->c;
+		bch_err(c, "root inode not found");
+		ret = bch_err_throw(c, ENOENT_inode);
 		goto err;
 	}
 
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6687b9235d3c..6c2e1d647403 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1112,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c)
 		prt_str(&buf, ")");
 		bch2_fs_fatal_error(c, ": %s", buf.buf);
 		printbuf_exit(&buf);
-		ret = -BCH_ERR_sb_not_downgraded;
+		ret = bch_err_throw(c, sb_not_downgraded);
 		goto out;
 	}
 
@@ -1142,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c)
 
 			if (c->opts.errors != BCH_ON_ERROR_continue &&
 			    c->opts.errors != BCH_ON_ERROR_fix_safe) {
-				ret = -BCH_ERR_erofs_sb_err;
+				ret = bch_err_throw(c, erofs_sb_err);
 				bch2_fs_fatal_error(c, "%s", buf.buf);
 			} else {
 				bch_err(c, "%s", buf.buf);
@@ -1161,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c)
 				ca->disk_sb.seq);
 			bch2_fs_fatal_error(c, "%s", buf.buf);
 			printbuf_exit(&buf);
-			ret = -BCH_ERR_erofs_sb_err;
+			ret = bch_err_throw(c, erofs_sb_err);
 		}
 	}
 
@@ -1215,7 +1215,7 @@ int bch2_write_super(struct bch_fs *c)
 				  !can_mount_with_written), c,
 		": Unable to write superblock to sufficient devices (from %ps)",
 		(void *) _RET_IP_))
-		ret = -BCH_ERR_erofs_sb_err;
+		ret = bch_err_throw(c, erofs_sb_err);
 out:
 	/* Make new options visible after they're persistent: */
 	bch2_sb_update(c);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 56ca2e4a415b..21b1b227f89f 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -474,16 +474,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
 
 	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
-		return -BCH_ERR_erofs_no_alloc_info;
+		return bch_err_throw(c, erofs_no_alloc_info);
 
 	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
 		bch_err(c, "cannot go rw, unfixed btree errors");
-		return -BCH_ERR_erofs_unfixed_errors;
+		return bch_err_throw(c, erofs_unfixed_errors);
 	}
 
 	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
 		bch_err(c, "cannot go rw, filesystem is an unresized image file");
-		return -BCH_ERR_erofs_filesystem_full;
+		return bch_err_throw(c, erofs_filesystem_full);
 	}
 
 	if (test_bit(BCH_FS_rw, &c->flags))
@@ -564,13 +564,13 @@ int bch2_fs_read_write(struct bch_fs *c)
 {
 	if (c->opts.recovery_pass_last &&
 	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
-		return -BCH_ERR_erofs_norecovery;
+		return bch_err_throw(c, erofs_norecovery);
 
 	if (c->opts.nochanges)
-		return -BCH_ERR_erofs_nochanges;
+		return bch_err_throw(c, erofs_nochanges);
 
 	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
-		return -BCH_ERR_erofs_no_alloc_info;
+		return bch_err_throw(c, erofs_no_alloc_info);
 
 	return __bch2_fs_read_write(c, false);
 }
@@ -755,7 +755,7 @@ static int bch2_fs_online(struct bch_fs *c)
 	if (c->sb.multi_device &&
 	    __bch2_uuid_to_fs(c->sb.uuid)) {
 		bch_err(c, "filesystem UUID already open");
-		return -BCH_ERR_filesystem_uuid_already_open;
+		return bch_err_throw(c, filesystem_uuid_already_open);
 	}
 
 	ret = bch2_fs_chardev_init(c);
@@ -814,7 +814,7 @@ static int bch2_fs_init_rw(struct bch_fs *c)
 				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
 	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
 				WQ_FREEZABLE, 0)))
-		return -BCH_ERR_ENOMEM_fs_other_alloc;
+		return bch_err_throw(c, ENOMEM_fs_other_alloc);
 
 	int ret = bch2_fs_btree_interior_update_init(c) ?:
 		bch2_fs_btree_write_buffer_init(c) ?:
@@ -995,7 +995,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
 	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
 				       c->opts.btree_node_size) ||
 	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
-		ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+		ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
 		goto err;
 	}
 
@@ -1155,7 +1155,7 @@ int bch2_fs_start(struct bch_fs *c)
 			 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
 
 	if (!bch2_fs_may_start(c))
-		return -BCH_ERR_insufficient_devices_to_start;
+		return bch_err_throw(c, insufficient_devices_to_start);
 
 	down_write(&c->state_lock);
 	mutex_lock(&c->sb_lock);
@@ -1166,7 +1166,7 @@ int bch2_fs_start(struct bch_fs *c)
 			sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
 		mutex_unlock(&c->sb_lock);
 		up_write(&c->state_lock);
-		ret = -BCH_ERR_ENOSPC_sb;
+		ret = bch_err_throw(c, ENOSPC_sb);
 		goto err;
 	}
 
@@ -1208,7 +1208,7 @@ int bch2_fs_start(struct bch_fs *c)
 		goto err;
 
 	if (bch2_fs_init_fault("fs_start")) {
-		ret = -BCH_ERR_injected_fs_start;
+		ret = bch_err_throw(c, injected_fs_start);
 		goto err;
 	}
 
@@ -1235,11 +1235,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
 
 	if (le16_to_cpu(sb->block_size) != block_sectors(c))
-		return -BCH_ERR_mismatched_block_size;
+		return bch_err_throw(c, mismatched_block_size);
 
 	if (le16_to_cpu(m.bucket_size) <
 	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
-		return -BCH_ERR_bucket_size_too_small;
+		return bch_err_throw(c, bucket_size_too_small);
 
 	return 0;
 }
@@ -1550,7 +1550,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	bch2_dev_attach(c, ca, dev_idx);
 	return 0;
 err:
-	return -BCH_ERR_ENOMEM_dev_alloc;
+	return bch_err_throw(c, ENOMEM_dev_alloc);
 }
 
 static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
@@ -1560,13 +1560,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 	if (bch2_dev_is_online(ca)) {
 		bch_err(ca, "already have device online in slot %u",
 			sb->sb->dev_idx);
-		return -BCH_ERR_device_already_online;
+		return bch_err_throw(ca->fs, device_already_online);
 	}
 
 	if (get_capacity(sb->bdev->bd_disk) <
 	    ca->mi.bucket_size * ca->mi.nbuckets) {
 		bch_err(ca, "cannot online: device too small");
-		return -BCH_ERR_device_size_too_small;
+		return bch_err_throw(ca->fs, device_size_too_small);
 	}
 
 	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
@@ -1718,7 +1718,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 		return 0;
 
 	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
-		return -BCH_ERR_device_state_not_allowed;
+		return bch_err_throw(c, device_state_not_allowed);
 
 	if (new_state != BCH_MEMBER_STATE_rw)
 		__bch2_dev_read_only(c, ca);
@@ -1771,7 +1771,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
 		bch_err(ca, "Cannot remove without losing data");
-		ret = -BCH_ERR_device_state_not_allowed;
+		ret = bch_err_throw(c, device_state_not_allowed);
 		goto err;
 	}
 
@@ -1907,7 +1907,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (list_empty(&c->list)) {
 		mutex_lock(&bch_fs_list_lock);
 		if (__bch2_uuid_to_fs(c->sb.uuid))
-			ret = -BCH_ERR_filesystem_uuid_already_open;
+			ret = bch_err_throw(c, filesystem_uuid_already_open);
 		else
 			list_add(&c->list, &bch_fs_list);
 		mutex_unlock(&bch_fs_list_lock);
@@ -2094,7 +2094,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
 		bch_err(ca, "Cannot offline required disk");
 		up_write(&c->state_lock);
-		return -BCH_ERR_device_state_not_allowed;
+		return bch_err_throw(c, device_state_not_allowed);
 	}
 
 	__bch2_dev_offline(c, ca);
@@ -2133,7 +2133,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
 		bch_err(ca, "New device size too big (%llu greater than max %u)",
 			nbuckets, BCH_MEMBER_NBUCKETS_MAX);
-		ret = -BCH_ERR_device_size_too_big;
+		ret = bch_err_throw(c, device_size_too_big);
 		goto err;
 	}
 
@@ -2141,7 +2141,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	    get_capacity(ca->disk_sb.bdev->bd_disk) <
 	    ca->mi.bucket_size * nbuckets) {
 		bch_err(ca, "New size larger than device");
-		ret = -BCH_ERR_device_size_too_small;
+		ret = bch_err_throw(c, device_size_too_small);
 		goto err;
 	}
 
@@ -2376,7 +2376,7 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
 	}
 
 	if (opts->nochanges && !opts->read_only) {
-		ret = -BCH_ERR_erofs_nochanges;
+		ret = bch_err_throw(c, erofs_nochanges);
 		goto err_print;
 	}
 
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index eb3ca963fc58..dc09532796af 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio,
 		  (unsigned long long)__entry->sector, __entry->nr_sector)
 );
 
+/* errors */
+
+TRACE_EVENT(error_throw,
+	TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
+	TP_ARGS(c, bch_err, ip),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(int,		err			)
+		__array(char,		err_str, 32		)
+		__array(char,		ip, 32			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= c->dev;
+		__entry->err		= bch_err;
+		strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
+		snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+	),
+
+	TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ip, __entry->err_str)
+);
+
+TRACE_EVENT(error_downcast,
+	TP_PROTO(int bch_err, int std_err, unsigned long ip),
+	TP_ARGS(bch_err, std_err, ip),
+
+	TP_STRUCT__entry(
+		__array(char,		bch_err, 32		)
+		__array(char,		std_err, 32		)
+		__array(char,		ip, 32			)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+		strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+		snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+	),
+
+	TP_printk("%s ret %s -> %s %s", __entry->ip,
+		  __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
 /* disk_accounting.c */
 
 TRACE_EVENT(accounting_mem_insert,
@@ -1446,25 +1490,6 @@ DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
 	TP_ARGS(c, str)
 );
 
-TRACE_EVENT(error_downcast,
-	TP_PROTO(int bch_err, int std_err, unsigned long ip),
-	TP_ARGS(bch_err, std_err, ip),
-
-	TP_STRUCT__entry(
-		__array(char,		bch_err, 32		)
-		__array(char,		std_err, 32		)
-		__array(char,		ip, 32			)
-	),
-
-	TP_fast_assign(
-		strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
-		strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
-		snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
-	),
-
-	TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
 #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
 
 TRACE_EVENT(update_by_path,

From a2ffab0e659831761443f3ae6341c30e845dce43 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 12:48:00 -0400
Subject: [PATCH 54/69] bcachefs: bch2_require_recovery_pass()

Add a helper for requiring that a recovery pass has already run: either
run it directly, if we're still in recovery, or if we're not in recovery
check if it has run recently and schedule it if it hasn't.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/errcode.h         |  7 +++++--
 fs/bcachefs/recovery_passes.c | 29 +++++++++++++++++++++++++++++
 fs/bcachefs/recovery_passes.h |  3 +++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 6b0791e1e64d..6e0bf7ff2641 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -182,9 +182,12 @@
 	x(BCH_ERR_fsck,			fsck_errors_not_fixed)			\
 	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
 	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
-	x(EINVAL,			restart_recovery)			\
-	x(EINVAL,			cannot_rewind_recovery)			\
+	x(EINVAL,			recovery_will_run)			\
+	x(BCH_ERR_recovery_will_run,	restart_recovery)			\
+	x(BCH_ERR_recovery_will_run,	cannot_rewind_recovery)			\
+	x(BCH_ERR_recovery_will_run,	recovery_pass_will_run)			\
 	x(0,				data_update_done)			\
+	x(0,				bkey_was_deleted)			\
 	x(BCH_ERR_data_update_done,	data_update_done_would_block)		\
 	x(BCH_ERR_data_update_done,	data_update_done_unwritten)		\
 	x(BCH_ERR_data_update_done,	data_update_done_no_writes_needed)	\
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index f6ddbab5ee82..9da43452fb30 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -384,6 +384,35 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
 	return ret;
 }
 
+/*
+ * Returns 0 if @pass has run recently, otherwise one of
+ * -BCH_ERR_restart_recovery
+ * -BCH_ERR_recovery_pass_will_run
+ */
+int bch2_require_recovery_pass(struct bch_fs *c,
+			       struct printbuf *out,
+			       enum bch_recovery_pass pass)
+{
+	if (test_bit(BCH_FS_in_recovery, &c->flags) &&
+	    c->recovery.passes_complete & BIT_ULL(pass))
+		return 0;
+
+	guard(mutex)(&c->sb_lock);
+
+	if (bch2_recovery_pass_want_ratelimit(c, pass))
+		return 0;
+
+	enum bch_run_recovery_pass_flags flags = 0;
+	int ret = 0;
+
+	if (recovery_pass_needs_set(c, pass, &flags)) {
+		ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+		bch2_write_super(c);
+	}
+
+	return ret ?: bch_err_throw(c, recovery_pass_will_run);
+}
+
 int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
 {
 	enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent;
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index dc0d2014ff9b..a97a462b5e11 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -24,6 +24,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
 				    enum bch_recovery_pass,
 				    enum bch_run_recovery_pass_flags);
 
+int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
+			       enum bch_recovery_pass);
+
 int bch2_run_online_recovery_passes(struct bch_fs *, u64);
 int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
 

From 0942b852d4070e448231d2e204ac82ad47f5920a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 13:01:44 -0400
Subject: [PATCH 55/69] bcachefs: BCH_RECOVERY_PASS_NO_RATELIMIT

Add a superblock flag to temporarily disable ratelimiting for a recovery
pass.

This will be used to make check_key_has_snapshot safer: we don't want to
delete a key for a missing snapshot unless we know that the snapshots
and subvolumes btrees are consistent, i.e. check_snapshots and
check_subvols have run recently.

Changing those btrees - creating/deleting a subvolume or snapshot - will
set the "disable ratelimit" flag, i.e. ensuring that those passes run if
check_key_has_snapshot discovers an error.

We're only disabling ratelimiting in the snapshot/subvol delete paths,
we're not so concerned about the create paths.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/recovery_passes.c        | 55 ++++++++++++++++++++++------
 fs/bcachefs/recovery_passes.h        |  2 +
 fs/bcachefs/recovery_passes_format.h |  2 +
 fs/bcachefs/snapshot.c               |  2 +
 fs/bcachefs/subvolume.c              |  5 ++-
 5 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 9da43452fb30..605588e33fb3 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -103,20 +103,20 @@ static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
 		prt_tab(out);
 
 		bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
+
+		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+			prt_str(out, " (no ratelimit)");
+
 		prt_newline(out);
 	}
 }
 
-static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
-					   enum bch_recovery_pass pass,
-					   s64 start_time)
+static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
+							       enum bch_recovery_pass pass)
 {
 	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
-	s64 end_time = ktime_get_real_seconds();
 
-	mutex_lock(&c->sb_lock);
-	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-	__clear_bit_le64(stable, ext->recovery_passes_required);
+	lockdep_assert_held(&c->sb_lock);
 
 	struct bch_sb_field_recovery_passes *r =
 		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
@@ -127,15 +127,43 @@ static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
 		r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
 		if (!r) {
 			bch_err(c, "error creating recovery_passes sb section");
-			goto out;
+			return NULL;
 		}
 	}
 
-	r->start[stable].last_run	= cpu_to_le64(end_time);
-	r->start[stable].last_runtime	= cpu_to_le32(max(0, end_time - start_time));
-out:
+	return r->start + stable;
+}
+
+static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
+					   enum bch_recovery_pass pass,
+					   s64 start_time)
+{
+	guard(mutex)(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+	__clear_bit_le64(bch2_recovery_pass_to_stable(pass),
+			 ext->recovery_passes_required);
+
+	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+	if (e) {
+		s64 end_time	= ktime_get_real_seconds();
+		e->last_run	= cpu_to_le64(end_time);
+		e->last_runtime	= cpu_to_le32(max(0, end_time - start_time));
+		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+	}
+
 	bch2_write_super(c);
-	mutex_unlock(&c->sb_lock);
+}
+
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
+					 enum bch_recovery_pass pass)
+{
+	guard(mutex)(&c->sb_lock);
+
+	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+	if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
+		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+		bch2_write_super(c);
+	}
 }
 
 static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
@@ -157,6 +185,9 @@ static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recover
 		 */
 		ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
 			ktime_get_real_seconds() - le64_to_cpu(i->last_run);
+
+		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+			ret = false;
 	}
 
 	return ret;
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index a97a462b5e11..260571c7105e 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -10,6 +10,8 @@ u64 bch2_recovery_passes_from_stable(u64 v);
 
 u64 bch2_fsck_recovery_passes(void);
 
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
+
 enum bch_run_recovery_pass_flags {
 	RUN_RECOVERY_PASS_nopersistent	= BIT(0),
 	RUN_RECOVERY_PASS_ratelimit	= BIT(1),
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
index c434eafbca19..b63c20558d3d 100644
--- a/fs/bcachefs/recovery_passes_format.h
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -87,6 +87,8 @@ struct recovery_pass_entry {
 	__le32			flags;
 };
 
+LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT,	struct recovery_pass_entry, flags, 0, 1)
+
 struct bch_sb_field_recovery_passes {
 	struct bch_sb_field	field;
 	struct recovery_pass_entry start[];
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 24818d29982d..0649bdd5e4d2 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1878,6 +1878,8 @@ int __bch2_delete_dead_snapshots(struct bch_fs *c)
 	d->running = false;
 	mutex_unlock(&d->progress_lock);
 	bch2_trans_put(trans);
+
+	bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
 out_unlock:
 	mutex_unlock(&d->lock);
 	if (!bch2_err_matches(ret, EROFS))
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index e22837396bd9..77ba50df2ff4 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -482,9 +482,12 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 
 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
-	return bch2_subvolumes_reparent(trans, subvolid) ?:
+	int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
 		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			  __bch2_subvolume_delete(trans, subvolid));
+
+	bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
+	return ret;
 }
 
 static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)

From e49cf9b54bc8b4c41c7aac8f12adb709f2015470 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 13:10:43 -0400
Subject: [PATCH 56/69] bcachefs: Make check_key_has_snapshot safer

Snapshot deletion v2 added sentinal values for deleted snapshots, so
"key for deleted snapshot" - i.e. snapshot deletion missed something -
is safe to repair automatically.

But if we find a key for a missing snapshot we have no idea what
happened, and we shouldn't delete it unless we're very sure that
everything else is consistent.

So hook it up to the new bch2_require_recovery_pass(), we'll now only
delete if snapshots and subvolumes have recenlty been checked.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/data_update.c | 13 ++++-------
 fs/bcachefs/snapshot.c    | 46 ++++++++++++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 92035859e588..5f1174348974 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -822,16 +822,11 @@ int bch2_data_update_init(struct btree_trans *trans,
 	int ret = 0;
 
 	if (k.k->p.snapshot) {
-		/*
-		 * We'll go ERO if we see a key for a missing snapshot, and if
-		 * we're still in recovery we want to give that a chance to
-		 * repair:
-		 */
-		if (unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
-			     bch2_snapshot_id_state(c, k.k->p.snapshot) == SNAPSHOT_ID_empty))
-			return bch_err_throw(c, data_update_done_no_snapshot);
-
 		ret = bch2_check_key_has_snapshot(trans, iter, k);
+		if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+			/* Can't repair yet, waiting on other recovery passes */
+			return bch_err_throw(c, data_update_done_no_snapshot);
+		}
 		if (ret < 0)
 			return ret;
 		if (ret) /* key was deleted */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0649bdd5e4d2..35dff323bfdb 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1045,19 +1045,39 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans,
 		ret = bch2_btree_delete_at(trans, iter,
 					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
 
-	/*
-	 * Snapshot missing: we should have caught this with btree_lost_data and
-	 * kicked off reconstruct_snapshots, so if we end up here we have no
-	 * idea what happened:
-	 */
-	if (fsck_err_on(state == SNAPSHOT_ID_empty,
-			trans, bkey_in_missing_snapshot,
-			"key in missing snapshot %s, delete?",
-			(bch2_btree_id_to_text(&buf, iter->btree_id),
-			 prt_char(&buf, ' '),
-			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-		ret = bch2_btree_delete_at(trans, iter,
-					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
+	if (state == SNAPSHOT_ID_empty) {
+		/*
+		 * Snapshot missing: we should have caught this with btree_lost_data and
+		 * kicked off reconstruct_snapshots, so if we end up here we have no
+		 * idea what happened.
+		 *
+		 * Do not delete unless we know that subvolumes and snapshots
+		 * are consistent:
+		 *
+		 * XXX:
+		 *
+		 * We could be smarter here, and instead of using the generic
+		 * recovery pass ratelimiting, track if there have been any
+		 * changes to the snapshots or inodes btrees since those passes
+		 * last ran.
+		 */
+		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+		ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+		if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+			ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+		unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+		if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+			     "key in missing snapshot %s, delete?",
+			     (bch2_btree_id_to_text(&buf, iter->btree_id),
+			      prt_char(&buf, ' '),
+			      bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			ret = bch2_btree_delete_at(trans, iter,
+						   BTREE_UPDATE_internal_snapshot_node) ?: 1;
+		}
+	}
 fsck_err:
 	printbuf_exit(&buf);
 	return ret;

From a4907d7f3380f19c11a6191feac85b563439012a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 1 Jun 2025 18:24:18 -0400
Subject: [PATCH 57/69] bcachefs: Run snapshot deletion out of system_long_wq

We don't want this running out of the same workqueue, and blocking,
writes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 35dff323bfdb..23a332d76b32 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1935,7 +1935,7 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 
 	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
 
-	if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work))
+	if (!queue_work(system_long_wq, &c->snapshot_delete.work))
 		enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
 }
 

From c72def523799a0b054fd7cbbed32509e365db55b Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 19:12:25 -0400
Subject: [PATCH 58/69] bcachefs: Run check_dirents second time if required

If we move a key backwards, we'll need a second pass to run the rest of
the fsck checks.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c   |  3 ++-
 fs/bcachefs/fsck.c     | 39 ++++++++++++++++++++++++++-------------
 fs/bcachefs/str_hash.c | 25 +++++++++++++++----------
 fs/bcachefs/str_hash.h |  9 ++++++---
 4 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 62bb88250861..0d77b7e3632c 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -705,8 +705,9 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum,
 
 			subvol_inum target;
 
+			bool need_second_pass = false;
 			int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
-							   hash_info, &iter, k) ?:
+							   hash_info, &iter, k, &need_second_pass) ?:
 				bch2_dirent_read_target(trans, inum, dirent, &target);
 			if (ret2 > 0)
 				continue;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index a7bf949df816..458f61c33958 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2141,7 +2141,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			struct bch_hash_info *hash_info,
 			struct inode_walker *dir,
 			struct inode_walker *target,
-			struct snapshots_seen *s)
+			struct snapshots_seen *s,
+			bool *need_second_pass)
 {
 	struct bch_fs *c = trans->c;
 	struct inode_walker_entry *i;
@@ -2183,7 +2184,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 		*hash_info = bch2_hash_info_init(c, &i->inode);
 	dir->first_this_inode = false;
 
-	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
+	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
+				      iter, k, need_second_pass);
 	if (ret < 0)
 		goto err;
 	if (ret) {
@@ -2228,7 +2230,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 						    STR_HASH_must_create) ?:
 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 
-		/* might need another check_dirents pass */
+		if (dir_offset < k.k->p.offset)
+			*need_second_pass = true;
 		goto out;
 	}
 
@@ -2296,7 +2299,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 err:
 fsck_err:
 	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -2310,17 +2312,30 @@ int bch2_check_dirents(struct bch_fs *c)
 	struct inode_walker target = inode_walker_init();
 	struct snapshots_seen s;
 	struct bch_hash_info hash_info;
+	bool need_second_pass = false, did_second_pass = false;
 
 	snapshots_seen_init(&s);
-
+again:
 	int ret = bch2_trans_run(c,
 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
 				POS(BCACHEFS_ROOT_INO, 0),
 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
+				     &need_second_pass)) ?:
 		check_subdir_count_notnested(trans, &dir));
 
+	if (!ret && need_second_pass && !did_second_pass) {
+		bch_info(c, "check_dirents requires second pass");
+		swap(did_second_pass, need_second_pass);
+		goto again;
+	}
+
+	if (!ret && need_second_pass) {
+		bch_err(c, "dirents not repairing");
+		ret = -EINVAL;
+	}
+
 	snapshots_seen_exit(&s);
 	inode_walker_exit(&dir);
 	inode_walker_exit(&target);
@@ -2334,16 +2349,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
 		       struct inode_walker *inode)
 {
 	struct bch_fs *c = trans->c;
-	struct inode_walker_entry *i;
-	int ret;
 
-	ret = bch2_check_key_has_snapshot(trans, iter, k);
+	int ret = bch2_check_key_has_snapshot(trans, iter, k);
 	if (ret < 0)
 		return ret;
 	if (ret)
 		return 0;
 
-	i = walk_inode(trans, inode, k);
+	struct inode_walker_entry *i = walk_inode(trans, inode, k);
 	ret = PTR_ERR_OR_ZERO(i);
 	if (ret)
 		return ret;
@@ -2359,9 +2372,9 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
 		*hash_info = bch2_hash_info_init(c, &i->inode);
 	inode->first_this_inode = false;
 
-	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
-	bch_err_fn(c, ret);
-	return ret;
+	bool need_second_pass = false;
+	return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
+				      iter, k, &need_second_pass);
 }
 
 /*
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 36d7ca7b0b1a..e6ecc8a549ba 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -35,7 +35,8 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
 				       struct snapshots_seen *s,
 				       const struct bch_hash_desc desc,
 				       struct bch_hash_info *hash_info,
-				       struct bkey_s_c_dirent old)
+				       struct bkey_s_c_dirent old,
+				       bool *updated_before_k_pos)
 {
 	struct qstr old_name = bch2_dirent_get_name(old);
 	struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
@@ -62,16 +63,15 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
 						old.k->p.snapshot, &new->k_i,
 						BTREE_UPDATE_internal_snapshot_node);
 		if (ret && !bch2_err_matches(ret, EEXIST))
-			goto err;
-		if (!ret)
 			break;
+		if (!ret) {
+			if (bpos_lt(new->k.p, old.k->p))
+				*updated_before_k_pos = true;
+			break;
+		}
 	}
 
-	if (ret)
-		goto err;
-
-	ret = bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
-err:
+	ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
 	bch_err_fn(trans->c, ret);
 	return ret;
 }
@@ -230,7 +230,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 			      struct snapshots_seen *s,
 			      const struct bch_hash_desc *desc,
 			      struct bch_hash_info *hash_info,
-			      struct btree_iter *k_iter, struct bkey_s_c hash_k)
+			      struct btree_iter *k_iter, struct bkey_s_c hash_k,
+			      bool *updated_before_k_pos)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter = {};
@@ -310,6 +311,9 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 		if (k.k)
 			goto duplicate_entries;
 
+		if (bpos_lt(new->k.p, k.k->p))
+			*updated_before_k_pos = true;
+
 		ret =   bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
 						       k_iter->pos, new->k.p) ?:
 			bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
@@ -345,7 +349,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 		ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
 		break;
 	case 2:
-		ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+		ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k),
+					 updated_before_k_pos) ?:
 			bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
 		goto out;
 	}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index ebcc006825cd..2d31bc2358d0 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -402,13 +402,15 @@ int __bch2_str_hash_check_key(struct btree_trans *,
 			      struct snapshots_seen *,
 			      const struct bch_hash_desc *,
 			      struct bch_hash_info *,
-			      struct btree_iter *, struct bkey_s_c);
+			      struct btree_iter *, struct bkey_s_c,
+			      bool *);
 
 static inline int bch2_str_hash_check_key(struct btree_trans *trans,
 			    struct snapshots_seen *s,
 			    const struct bch_hash_desc *desc,
 			    struct bch_hash_info *hash_info,
-			    struct btree_iter *k_iter, struct bkey_s_c hash_k)
+			    struct btree_iter *k_iter, struct bkey_s_c hash_k,
+			    bool *updated_before_k_pos)
 {
 	if (hash_k.k->type != desc->key_type)
 		return 0;
@@ -416,7 +418,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans,
 	if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
 		return 0;
 
-	return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+	return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
+					 updated_before_k_pos);
 }
 
 #endif /* _BCACHEFS_STR_HASH_H */

From 01d925f7e18360f740e7b75bfad9b6e55fd4dc53 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Wed, 4 Jun 2025 12:38:27 -0700
Subject: [PATCH 59/69] bcachefs: Fix -Wc23-extensions in bch2_check_dirents()

Clang warns (or errors with CONFIG_WERROR=y):

  fs/bcachefs/fsck.c:2325:2: error: label followed by a declaration is a C23 extension [-Werror,-Wc23-extensions]
   2325 |         int ret = bch2_trans_run(c,
        |         ^

On clang-17 and older, this is an unconditional error:

  fs/bcachefs/fsck.c:2325:2: error: expected expression
   2325 |         int ret = bch2_trans_run(c,
        |         ^

Move the declaration of ret to the top of the function to resolve both
ways this issue manifests.

Fixes: c72def523799 ("bcachefs: Run check_dirents second time if required")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 458f61c33958..dedb80f71049 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2313,10 +2313,11 @@ int bch2_check_dirents(struct bch_fs *c)
 	struct snapshots_seen s;
 	struct bch_hash_info hash_info;
 	bool need_second_pass = false, did_second_pass = false;
+	int ret;
 
 	snapshots_seen_init(&s);
 again:
-	int ret = bch2_trans_run(c,
+	ret = bch2_trans_run(c,
 		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
 				POS(BCACHEFS_ROOT_INO, 0),
 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,

From 35c1f131bc5ff9d6fb9599b3ae5a15dd83a27e31 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 1 Jun 2025 18:35:18 -0400
Subject: [PATCH 60/69] bcachefs: Redo bch2_dirent_init_name()

Redo (and simplify somewhat) how casefolded and non casefolded dirents
are initialized, and export this to be used by fsck_rename_dirent().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c | 139 ++++++++++++++++++++-----------------------
 fs/bcachefs/dirent.h |   7 ++-
 2 files changed, 71 insertions(+), 75 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 0d77b7e3632c..36bccd2a2763 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -231,67 +231,61 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 	prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
 }
 
-static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
-				subvol_inum dir,
-				u8 type,
-				int name_len, int cf_name_len,
-				u64 dst)
+int bch2_dirent_init_name(struct bkey_i_dirent *dirent,
+			  const struct bch_hash_info *hash_info,
+			  const struct qstr *name,
+			  const struct qstr *cf_name)
 {
-	struct bkey_i_dirent *dirent;
-	unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
+	EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
+	int cf_len = 0;
 
-	BUG_ON(u64s > U8_MAX);
+	if (name->len > BCH_NAME_MAX)
+		return -ENAMETOOLONG;
 
-	dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
-	if (IS_ERR(dirent))
-		return dirent;
+	dirent->v.d_casefold = hash_info->cf_encoding != NULL;
 
-	bkey_dirent_init(&dirent->k_i);
-	dirent->k.u64s = u64s;
-
-	if (type != DT_SUBVOL) {
-		dirent->v.d_inum = cpu_to_le64(dst);
+	if (!dirent->v.d_casefold) {
+		memcpy(&dirent->v.d_name[0], name->name, name->len);
+		memset(&dirent->v.d_name[name->len], 0,
+		       bkey_val_bytes(&dirent->k) -
+		       offsetof(struct bch_dirent, d_name) -
+		       name->len);
 	} else {
-		dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
-		dirent->v.d_child_subvol = cpu_to_le32(dst);
+#ifdef CONFIG_UNICODE
+		memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
+
+		char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
+
+		if (cf_name) {
+			cf_len = cf_name->len;
+
+			memcpy(cf_out, cf_name->name, cf_name->len);
+		} else {
+			cf_len = utf8_casefold(hash_info->cf_encoding, name,
+					       cf_out,
+					       bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
+			if (cf_len <= 0)
+				return cf_len;
+		}
+
+		memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
+		       bkey_val_bytes(&dirent->k) -
+		       offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+		       name->len + cf_len);
+
+		dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+		dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
+
+		EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
+#else
+	return -EOPNOTSUPP;
+#endif
 	}
 
-	dirent->v.d_type = type;
-	dirent->v.d_unused = 0;
-	dirent->v.d_casefold = cf_name_len ? 1 : 0;
-
-	return dirent;
-}
-
-static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
-				     const struct qstr *name)
-{
-	EBUG_ON(dirent->v.d_casefold);
-
-	memcpy(&dirent->v.d_name[0], name->name, name->len);
-	memset(&dirent->v.d_name[name->len], 0,
-		bkey_val_bytes(&dirent->k) -
-		offsetof(struct bch_dirent, d_name) -
-		name->len);
-}
-
-static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
-					const struct qstr *name,
-					const struct qstr *cf_name)
-{
-	EBUG_ON(!dirent->v.d_casefold);
-	EBUG_ON(!cf_name->len);
-
-	dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
-	dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
-	memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
-	memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
-	memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
-		bkey_val_bytes(&dirent->k) -
-		offsetof(struct bch_dirent, d_cf_name_block.d_names) -
-		name->len + cf_name->len);
-
-	EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+	unsigned u64s = dirent_val_u64s(name->len, cf_len);
+	BUG_ON(u64s > bkey_val_u64s(&dirent->k));
+	set_bkey_val_u64s(&dirent->k, u64s);
+	return 0;
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -302,31 +296,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 				const struct qstr *cf_name,
 				u64 dst)
 {
-	struct bkey_i_dirent *dirent;
-	struct qstr _cf_name;
-
-	if (name->len > BCH_NAME_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	if (hash_info->cf_encoding && !cf_name) {
-		int ret = bch2_casefold(trans, hash_info, name, &_cf_name);
-		if (ret)
-			return ERR_PTR(ret);
-
-		cf_name = &_cf_name;
-	}
-
-	dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
+	struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
 	if (IS_ERR(dirent))
 		return dirent;
 
-	if (cf_name)
-		dirent_init_casefolded_name(dirent, name, cf_name);
-	else
-		dirent_init_regular_name(dirent, name);
+	bkey_dirent_init(&dirent->k_i);
+	dirent->k.u64s = BKEY_U64s_MAX;
+
+	if (type != DT_SUBVOL) {
+		dirent->v.d_inum = cpu_to_le64(dst);
+	} else {
+		dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+		dirent->v.d_child_subvol = cpu_to_le32(dst);
+	}
+
+	dirent->v.d_type = type;
+	dirent->v.d_unused = 0;
+
+	int ret = bch2_dirent_init_name(dirent, hash_info, name, cf_name);
+	if (ret)
+		return ERR_PTR(ret);
 
 	EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
-
 	return dirent;
 }
 
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index f94d589cab6f..31848b5cd88c 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -38,7 +38,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans,
 	}
 }
 
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
 
 static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
 {
@@ -59,6 +59,11 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst,
 	dst->v.d_type = src.v->d_type;
 }
 
+int bch2_dirent_init_name(struct bkey_i_dirent *,
+			  const struct bch_hash_info *,
+			  const struct qstr *,
+			  const struct qstr *);
+
 int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
 			const struct bch_hash_info *, u8,
 			const struct qstr *, u64, u64 *,

From b938d3c970175b2f3d22865dc077482fc6137828 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 17:00:00 -0400
Subject: [PATCH 61/69] bcachefs: Fix bch2_fsck_rename_dirent() for casefold

bch2_fsck_renamed_dirent was creating bch_dirent keys open-coded - but
we need to use the appropriate helper, if the directory is casefolded.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fsck.c     |  4 ++++
 fs/bcachefs/str_hash.c | 23 +++++++++++++++--------
 fs/bcachefs/super.c    | 11 ++++++-----
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index dedb80f71049..4a72dbdcc0e6 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2184,6 +2184,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 		*hash_info = bch2_hash_info_init(c, &i->inode);
 	dir->first_this_inode = false;
 
+#ifdef CONFIG_UNICODE
+	hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
+#endif
+
 	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
 				      iter, k, need_second_pass);
 	if (ret < 0)
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index e6ecc8a549ba..7904bf15717b 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -39,7 +39,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
 				       bool *updated_before_k_pos)
 {
 	struct qstr old_name = bch2_dirent_get_name(old);
-	struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+	struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
 	int ret = PTR_ERR_OR_ZERO(new);
 	if (ret)
 		return ret;
@@ -48,20 +48,27 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
 	dirent_copy_target(new, old);
 	new->k.p = old.k->p;
 
+	char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
+	ret = PTR_ERR_OR_ZERO(renamed_buf);
+	if (ret)
+		return ret;
+
 	for (unsigned i = 0; i < 1000; i++) {
-		unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
-				       old_name.len, old_name.name, i);
-		unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
+		new->k.u64s = BKEY_U64s_MAX;
 
-		if (u64s > U8_MAX)
-			return -EINVAL;
+		struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
+					sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
+						old_name.len, old_name.name, i));
 
-		new->k.u64s = u64s;
+		ret = bch2_dirent_init_name(new, hash_info, &renamed_name, NULL);
+		if (ret)
+			return ret;
 
 		ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
 						(subvol_inum) { 0, old.k->p.inode },
 						old.k->p.snapshot, &new->k_i,
-						BTREE_UPDATE_internal_snapshot_node);
+						BTREE_UPDATE_internal_snapshot_node|
+						STR_HASH_must_create);
 		if (ret && !bch2_err_matches(ret, EEXIST))
 			break;
 		if (!ret) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 21b1b227f89f..397a69da5a75 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1148,11 +1148,12 @@ int bch2_fs_start(struct bch_fs *c)
 
 	print_mount_opts(c);
 
-	if (IS_ENABLED(CONFIG_UNICODE))
-		bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
-			 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
-			 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
-			 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+#ifdef CONFIG_UNICODE
+	bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+		 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+		 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+		 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+#endif
 
 	if (!bch2_fs_may_start(c))
 		return bch_err_throw(c, insufficient_devices_to_start);

From 2bf380c005adcc653464215a5170afa3367c0b22 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 31 May 2025 00:11:52 -0400
Subject: [PATCH 62/69] bcachefs: Fix dirent_casefold_mismatch repair

Instead of simply recreating a mis-casefolded dirent, use the str_hash
repair code, which will rename it if necessary - the dirent might have
been created again with the correct casefolding.

Factor out out bch2_str_hash_repair key() from
__bch2_str_hash_check_key() for the new path to use, and export
bch2_dirent_create_key() as well.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/dirent.c   |  14 +--
 fs/bcachefs/dirent.h   |   3 +
 fs/bcachefs/fsck.c     |  42 ++++----
 fs/bcachefs/str_hash.c | 213 +++++++++++++++++++++++------------------
 fs/bcachefs/str_hash.h |   8 ++
 5 files changed, 162 insertions(+), 118 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 36bccd2a2763..300f7cc8abdf 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -288,7 +288,7 @@ int bch2_dirent_init_name(struct bkey_i_dirent *dirent,
 	return 0;
 }
 
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
 				const struct bch_hash_info *hash_info,
 				subvol_inum dir,
 				u8 type,
@@ -332,7 +332,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
 	struct bkey_i_dirent *dirent;
 	int ret;
 
-	dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
+	dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
 	ret = PTR_ERR_OR_ZERO(dirent);
 	if (ret)
 		return ret;
@@ -356,7 +356,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
 	struct bkey_i_dirent *dirent;
 	int ret;
 
-	dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
+	dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
 	ret = PTR_ERR_OR_ZERO(dirent);
 	if (ret)
 		return ret;
@@ -461,8 +461,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
 		*src_offset = dst_iter.pos.offset;
 
 	/* Create new dst key: */
-	new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
-				    dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
+	new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
+					 dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
 	ret = PTR_ERR_OR_ZERO(new_dst);
 	if (ret)
 		goto out;
@@ -472,8 +472,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
 
 	/* Create new src key: */
 	if (mode == BCH_RENAME_EXCHANGE) {
-		new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name,
-					    src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
+		new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
+						 src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
 		ret = PTR_ERR_OR_ZERO(new_src);
 		if (ret)
 			goto out;
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 31848b5cd88c..70fb0b581221 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -63,6 +63,9 @@ int bch2_dirent_init_name(struct bkey_i_dirent *,
 			  const struct bch_hash_info *,
 			  const struct qstr *,
 			  const struct qstr *);
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
+				const struct bch_hash_info *, subvol_inum, u8,
+				const struct qstr *, const struct qstr *, u64);
 
 int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
 			const struct bch_hash_info *, u8,
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 4a72dbdcc0e6..68ed69a255e1 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2210,32 +2210,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, k),
 			 buf.buf))) {
-		struct qstr name = bch2_dirent_get_name(d);
-		u32 subvol = d.v->d_type == DT_SUBVOL
-			? le32_to_cpu(d.v->d_parent_subvol)
-			: 0;
+		subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
+				? le32_to_cpu(d.v->d_parent_subvol)
+				: 0,
+		};
 		u64 target = d.v->d_type == DT_SUBVOL
 			? le32_to_cpu(d.v->d_child_subvol)
 			: le64_to_cpu(d.v->d_inum);
-		u64 dir_offset;
+		struct qstr name = bch2_dirent_get_name(d);
 
-		ret =   bch2_hash_delete_at(trans,
+		struct bkey_i_dirent *new_d =
+			bch2_dirent_create_key(trans, hash_info, dir_inum,
+					       d.v->d_type, &name, NULL, target);
+		ret = PTR_ERR_OR_ZERO(new_d);
+		if (ret)
+			goto out;
+
+		new_d->k.p.inode	= d.k->p.inode;
+		new_d->k.p.snapshot	= d.k->p.snapshot;
+
+		struct btree_iter dup_iter = {};
+		ret =	bch2_hash_delete_at(trans,
 					    bch2_dirent_hash_desc, hash_info, iter,
 					    BTREE_UPDATE_internal_snapshot_node) ?:
-			bch2_dirent_create_snapshot(trans, subvol,
-						    d.k->p.inode, d.k->p.snapshot,
-						    hash_info,
-						    d.v->d_type,
-						    &name,
-						    target,
-						    &dir_offset,
-						    BTREE_ITER_with_updates|
-						    BTREE_UPDATE_internal_snapshot_node|
-						    STR_HASH_must_create) ?:
-			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
-		if (dir_offset < k.k->p.offset)
-			*need_second_pass = true;
+			bch2_str_hash_repair_key(trans, s,
+						 &bch2_dirent_hash_desc, hash_info,
+						 iter, bkey_i_to_s_c(&new_d->k_i),
+						 &dup_iter, bkey_s_c_null,
+						 need_second_pass);
 		goto out;
 	}
 
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 7904bf15717b..71b735a85026 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -31,12 +31,12 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
 	}
 }
 
-static noinline int fsck_rename_dirent(struct btree_trans *trans,
-				       struct snapshots_seen *s,
-				       const struct bch_hash_desc desc,
-				       struct bch_hash_info *hash_info,
-				       struct bkey_s_c_dirent old,
-				       bool *updated_before_k_pos)
+static int bch2_fsck_rename_dirent(struct btree_trans *trans,
+				   struct snapshots_seen *s,
+				   const struct bch_hash_desc desc,
+				   struct bch_hash_info *hash_info,
+				   struct bkey_s_c_dirent old,
+				   bool *updated_before_k_pos)
 {
 	struct qstr old_name = bch2_dirent_get_name(old);
 	struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
@@ -233,54 +233,20 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans
 	return ret;
 }
 
-int __bch2_str_hash_check_key(struct btree_trans *trans,
-			      struct snapshots_seen *s,
-			      const struct bch_hash_desc *desc,
-			      struct bch_hash_info *hash_info,
-			      struct btree_iter *k_iter, struct bkey_s_c hash_k,
-			      bool *updated_before_k_pos)
+/* Put a str_hash key in its proper location, checking for duplicates */
+int bch2_str_hash_repair_key(struct btree_trans *trans,
+			     struct snapshots_seen *s,
+			     const struct bch_hash_desc *desc,
+			     struct bch_hash_info *hash_info,
+			     struct btree_iter *k_iter, struct bkey_s_c k,
+			     struct btree_iter *dup_iter, struct bkey_s_c dup_k,
+			     bool *updated_before_k_pos)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter iter = {};
 	struct printbuf buf = PRINTBUF;
-	struct bkey_s_c k;
 	bool free_snapshots_seen = false;
 	int ret = 0;
 
-	u64 hash = desc->hash_bkey(hash_info, hash_k);
-	if (hash_k.k->p.offset < hash)
-		goto bad_hash;
-
-	for_each_btree_key_norestart(trans, iter, desc->btree_id,
-				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
-				     BTREE_ITER_slots|
-				     BTREE_ITER_with_updates, k, ret) {
-		if (bkey_eq(k.k->p, hash_k.k->p))
-			break;
-
-		if (k.k->type == desc->key_type &&
-		    !desc->cmp_bkey(k, hash_k))
-			goto duplicate_entries;
-
-		if (bkey_deleted(k.k)) {
-			bch2_trans_iter_exit(trans, &iter);
-			goto bad_hash;
-		}
-	}
-out:
-	bch2_trans_iter_exit(trans, &iter);
-	printbuf_exit(&buf);
-	if (free_snapshots_seen)
-		darray_exit(&s->ids);
-	return ret;
-bad_hash:
-	/*
-	 * Before doing any repair, check hash_info itself:
-	 */
-	ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
-	if (ret)
-		goto out;
-
 	if (!s) {
 		s = bch2_trans_kmalloc(trans, sizeof(*s));
 		ret = PTR_ERR_OR_ZERO(s);
@@ -297,25 +263,22 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 		free_snapshots_seen = true;
 	}
 
-	if (fsck_err(trans, hash_table_key_wrong_offset,
-		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
-		     bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
-		     (printbuf_reset(&buf),
-		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
-		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
-		if (IS_ERR(new))
-			return PTR_ERR(new);
+	if (!dup_k.k) {
+		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+		ret = PTR_ERR_OR_ZERO(new);
+		if (ret)
+			goto out;
 
-		k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
-				       (subvol_inum) { 0, hash_k.k->p.inode },
-				       hash_k.k->p.snapshot, new,
+		dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
+				       (subvol_inum) { 0, new->k.p.inode },
+				       new->k.p.snapshot, new,
 				       STR_HASH_must_create|
 				       BTREE_ITER_with_updates|
 				       BTREE_UPDATE_internal_snapshot_node);
-		ret = bkey_err(k);
+		ret = bkey_err(dup_k);
 		if (ret)
 			goto out;
-		if (k.k)
+		if (dup_k.k)
 			goto duplicate_entries;
 
 		if (bpos_lt(new->k.p, k.k->p))
@@ -329,40 +292,108 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
 			bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
 			-BCH_ERR_transaction_restart_commit;
-		goto out;
-	}
-fsck_err:
-	goto out;
+	} else {
 duplicate_entries:
-	ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
-	if (ret < 0)
-		goto out;
+		ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
+		if (ret < 0)
+			goto out;
 
-	if (!fsck_err(trans, hash_table_key_duplicate,
-		      "duplicate hash table keys%s:\n%s",
-		      ret != 2 ? "" : ", both point to valid inodes",
-		      (printbuf_reset(&buf),
-		       bch2_bkey_val_to_text(&buf, c, hash_k),
-		       prt_newline(&buf),
-		       bch2_bkey_val_to_text(&buf, c, k),
-		       buf.buf)))
-		goto out;
+		if (!fsck_err(trans, hash_table_key_duplicate,
+			      "duplicate hash table keys%s:\n%s",
+			      ret != 2 ? "" : ", both point to valid inodes",
+			      (printbuf_reset(&buf),
+			       bch2_bkey_val_to_text(&buf, c, k),
+			       prt_newline(&buf),
+			       bch2_bkey_val_to_text(&buf, c, dup_k),
+			       buf.buf)))
+			goto out;
 
-	switch (ret) {
-	case 0:
-		ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
-		break;
-	case 1:
-		ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
-		break;
-	case 2:
-		ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k),
-					 updated_before_k_pos) ?:
-			bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
-		goto out;
+		switch (ret) {
+		case 0:
+			ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+			break;
+		case 1:
+			ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
+			break;
+		case 2:
+			ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
+						      bkey_s_c_to_dirent(k),
+						      updated_before_k_pos) ?:
+				bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+						    BTREE_ITER_with_updates);
+			goto out;
+		}
+
+		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+			-BCH_ERR_transaction_restart_commit;
 	}
+out:
+fsck_err:
+	bch2_trans_iter_exit(trans, dup_iter);
+	printbuf_exit(&buf);
+	if (free_snapshots_seen)
+		darray_exit(&s->ids);
+	return ret;
+}
 
-	ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
-		-BCH_ERR_transaction_restart_commit;
+int __bch2_str_hash_check_key(struct btree_trans *trans,
+			      struct snapshots_seen *s,
+			      const struct bch_hash_desc *desc,
+			      struct bch_hash_info *hash_info,
+			      struct btree_iter *k_iter, struct bkey_s_c hash_k,
+			      bool *updated_before_k_pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter = {};
+	struct printbuf buf = PRINTBUF;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	u64 hash = desc->hash_bkey(hash_info, hash_k);
+	if (hash_k.k->p.offset < hash)
+		goto bad_hash;
+
+	for_each_btree_key_norestart(trans, iter, desc->btree_id,
+				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+				     BTREE_ITER_slots|
+				     BTREE_ITER_with_updates, k, ret) {
+		if (bkey_eq(k.k->p, hash_k.k->p))
+			break;
+
+		if (k.k->type == desc->key_type &&
+		    !desc->cmp_bkey(k, hash_k)) {
+			ret =	check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
+								   hash_info) ?:
+				bch2_str_hash_repair_key(trans, s, desc, hash_info,
+							 k_iter, hash_k,
+							 &iter, k, updated_before_k_pos);
+			break;
+		}
+
+		if (bkey_deleted(k.k))
+			goto bad_hash;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+bad_hash:
+	bch2_trans_iter_exit(trans, &iter);
+	/*
+	 * Before doing any repair, check hash_info itself:
+	 */
+	ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
+	if (ret)
+		goto out;
+
+	if (fsck_err(trans, hash_table_key_wrong_offset,
+		     "hash table key at wrong offset: should be at %llu\n%s",
+		     hash,
+		     (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
+		ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
+					       k_iter, hash_k,
+					       &iter, bkey_s_c_null,
+					       updated_before_k_pos);
 	goto out;
 }
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 2d31bc2358d0..79d51aef70aa 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -398,6 +398,14 @@ int bch2_hash_delete(struct btree_trans *trans,
 int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
 
 struct snapshots_seen;
+int bch2_str_hash_repair_key(struct btree_trans *,
+			     struct snapshots_seen *,
+			     const struct bch_hash_desc *,
+			     struct bch_hash_info *,
+			     struct btree_iter *, struct bkey_s_c,
+			     struct btree_iter *, struct bkey_s_c,
+			     bool *);
+
 int __bch2_str_hash_check_key(struct btree_trans *,
 			      struct snapshots_seen *,
 			      const struct bch_hash_desc *,

From 9f2dc5f39431a17d304e5845a3f7e78905473442 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 2 Jun 2025 09:26:20 -0400
Subject: [PATCH 63/69] bcachefs: Fix oops in btree_node_seq_matches()

btree_update_nodes_written() needs to wait on in-flight writes to old
nodes before marking them as freed. But it has no reason to pin those
old nodes in memory, so some trickyness ensues.

The update we're completing deleted references to those nodes from the
btree, so we know if they've been evicted they can't be pulled back in.
We just have to check if the nodes we have pointers to are still those
old nodes, and haven't been reused.

To do that we check the node's "sequence number" (actually a random 64
bit cookie), but that lives in the node's data buffer. 'struct btree'
can't be freed until filesystem shutdown (as they're quite small), but
the data buffers can be freed or swapped around.

Commit 1f88c3567495, which was fixing a kmsan warning, assumed that we
could safely do this locklessly with just a READ_ONCE() - if we've got a
non-null ptr it would be safe to read from.

But that's not true if the data buffer is a vmalloc allocation, so we
need to restore the locking that commit deleted (or alternatively RCU
free those data buffers, but there's no other reason for that).

Fixes: 1f88c3567495 ("bcachefs: Fix a KMSAN splat in btree_update_nodes_written()")
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_update_interior.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index e3639008be5c..d2ecb782919b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -684,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as)
 
 	/*
 	 * Wait for any in flight writes to finish before we free the old nodes
-	 * on disk:
+	 * on disk. But we haven't pinned those old nodes in the btree cache,
+	 * they might have already been evicted.
+	 *
+	 * The update we're completing deleted references to those nodes from the
+	 * btree, so we know if they've been evicted they can't be pulled back in.
+	 * We just have to check if the nodes we have pointers to are still those
+	 * old nodes, and haven't been reused.
+	 *
+	 * This can't be done locklessly because the data buffer might have been
+	 * vmalloc allocated, and they're not RCU freed. We also need the
+	 * __no_kmsan_checks annotation because even with the btree node read
+	 * lock, nothing tells us that the data buffer has been initialized (if
+	 * the btree node has been reused for a different node, and the data
+	 * buffer swapped for a new data buffer).
 	 */
 	for (i = 0; i < as->nr_old_nodes; i++) {
 		b = as->old_nodes[i];
 
-		if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
+		bch2_trans_begin(trans);
+		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+		bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
+		six_unlock_read(&b->c.lock);
+		bch2_trans_unlock_long(trans);
+
+		if (seq_matches)
 			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
 				       TASK_UNINTERRUPTIBLE);
 	}

From bfaac2c54694f72d08db5cfad0a1fce2a4c7e45e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 2 Jun 2025 17:23:49 -0400
Subject: [PATCH 64/69] bcachefs: Add flags to subvolume_to_text()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/subvolume.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 77ba50df2ff4..60c10365c285 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -255,6 +255,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
 		prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
 		prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
 	}
+
+	if (BCH_SUBVOLUME_RO(s.v))
+		prt_printf(out, " ro");
+	if (BCH_SUBVOLUME_SNAP(s.v))
+		prt_printf(out, " snapshot");
+	if (BCH_SUBVOLUME_UNLINKED(s.v))
+		prt_printf(out, " unlinked");
 }
 
 static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)

From bb6689bbeebc6fb51f0f120b486bdcc9a38ffcf6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 2 Jun 2025 18:26:44 -0400
Subject: [PATCH 65/69] bcachefs: delete dead code from
 may_delete_deleted_inode()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/inode.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 391daae5c171..e76721c11eef 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1392,10 +1392,7 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
 }
 
-static int may_delete_deleted_inode(struct btree_trans *trans,
-				    struct btree_iter *iter,
-				    struct bpos pos,
-				    bool *need_another_pass)
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter inode_iter;
@@ -1484,9 +1481,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 int bch2_delete_dead_inodes(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
-	bool need_another_pass;
 	int ret;
-again:
+
 	/*
 	 * if we ran check_inodes() unlinked inodes will have already been
 	 * cleaned up but the write buffer will be out of sync; therefore we
@@ -1496,8 +1492,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	if (ret)
 		goto err;
 
-	need_another_pass = false;
-
 	/*
 	 * Weird transaction restart handling here because on successful delete,
 	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
@@ -1507,7 +1501,7 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
 					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
 					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-		ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+		ret = may_delete_deleted_inode(trans, k.k->p);
 		if (ret > 0) {
 			bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
 						k.k->p.offset, k.k->p.snapshot);
@@ -1528,9 +1522,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 
 		ret;
 	}));
-
-	if (!ret && need_another_pass)
-		goto again;
 err:
 	bch2_trans_put(trans);
 	bch_err_fn(c, ret);

From 09fb85ae565645b982e9030dbb2ff6707f2cdddc Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 2 Jun 2025 17:43:36 -0400
Subject: [PATCH 66/69] bcachefs: Run may_delete_deleted_inode() checks in
 bch2_inode_rm()

We had a bug where bch2_evict_inode() incorrectly called bch2_inode_rm()
- the journal clearly showed the inode was not unlinked.

We've got checks that we use in recovery when cleaning up deleted
inodes, lift them to bch2_inode_rm() as well.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/errcode.h          |  2 +
 fs/bcachefs/fs.c               |  8 +++-
 fs/bcachefs/inode.c            | 68 +++++++++++++++++++++++++---------
 fs/bcachefs/sb-errors_format.h |  3 +-
 4 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 6e0bf7ff2641..ac3264134a15 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -214,6 +214,8 @@
 	x(EINVAL,			remove_would_lose_data)			\
 	x(EINVAL,			no_resize_with_buckets_nouse)		\
 	x(EINVAL,			inode_unpack_error)			\
+	x(EINVAL,			inode_not_unlinked)			\
+	x(EINVAL,			inode_has_child_snapshot)		\
 	x(EINVAL,			varint_decode_error)			\
 	x(EINVAL,			erasure_coding_found_btree_node)	\
 	x(EINVAL,			option_negative)			\
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 41008d4a2b99..85d13f800165 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2180,7 +2180,13 @@ static void bch2_evict_inode(struct inode *vinode)
 				KEY_TYPE_QUOTA_WARN);
 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
 				KEY_TYPE_QUOTA_WARN);
-		bch2_inode_rm(c, inode_inum(inode));
+		int ret = bch2_inode_rm(c, inode_inum(inode));
+		if (ret && !bch2_err_matches(ret, EROFS)) {
+			bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
+				    inode->ei_inum.subvol,
+				    inode->ei_inum.inum);
+			bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
+		}
 
 		/*
 		 * If we are deleting, we need it present in the vfs hash table
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index e76721c11eef..53e5dc1f6ac1 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = {
 #undef  x
 
 static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
 
 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
 
@@ -1130,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 	u32 snapshot;
 	int ret;
 
+	ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
+	if (ret)
+		goto err2;
+
 	/*
 	 * If this was a directory, there shouldn't be any real dirents left -
 	 * but there could be whiteouts (from hash collisions) that we should
 	 * delete:
 	 *
-	 * XXX: the dirent could ideally would delete whiteouts when they're no
+	 * XXX: the dirent code ideally would delete whiteouts when they're no
 	 * longer needed
 	 */
 	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
 		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
 		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
 	if (ret)
-		goto err;
+		goto err2;
 retry:
 	bch2_trans_begin(trans);
 
@@ -1392,7 +1397,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
 }
 
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
+				    bool from_deleted_inodes)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter inode_iter;
@@ -1406,12 +1412,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 	if (ret)
 		return ret;
 
-	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
-	if (fsck_err_on(!bkey_is_inode(k.k),
+	ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
+	if (fsck_err_on(from_deleted_inodes && ret,
 			trans, deleted_inode_missing,
 			"nonexistent inode %llu:%u in deleted_inodes btree",
 			pos.offset, pos.snapshot))
 		goto delete;
+	if (ret)
+		goto out;
 
 	ret = bch2_inode_unpack(k, &inode);
 	if (ret)
@@ -1419,7 +1427,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 
 	if (S_ISDIR(inode.bi_mode)) {
 		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
-		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+		if (fsck_err_on(from_deleted_inodes &&
+				bch2_err_matches(ret, ENOTEMPTY),
 				trans, deleted_inode_is_dir,
 				"non empty directory %llu:%u in deleted_inodes btree",
 				pos.offset, pos.snapshot))
@@ -1428,17 +1437,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 			goto out;
 	}
 
-	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+	ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
+	if (fsck_err_on(from_deleted_inodes && ret,
 			trans, deleted_inode_not_unlinked,
 			"non-deleted inode %llu:%u in deleted_inodes btree",
 			pos.offset, pos.snapshot))
 		goto delete;
+	if (ret)
+		goto out;
 
-	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+	ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
+		? 0 : bch_err_throw(c, inode_has_child_snapshot);
+
+	if (fsck_err_on(from_deleted_inodes && ret,
 			trans, deleted_inode_has_child_snapshots,
 			"inode with child snapshots %llu:%u in deleted_inodes btree",
 			pos.offset, pos.snapshot))
 		goto delete;
+	if (ret)
+		goto out;
 
 	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
 	if (ret < 0)
@@ -1455,19 +1472,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 			if (ret)
 				goto out;
 		}
+
+		if (!from_deleted_inodes) {
+			ret =   bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+				bch_err_throw(c, inode_has_child_snapshot);
+			goto out;
+		}
+
 		goto delete;
 
 	}
 
-	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
-	    !fsck_err(trans, deleted_inode_but_clean,
-		      "filesystem marked as clean but have deleted inode %llu:%u",
-		      pos.offset, pos.snapshot)) {
-		ret = 0;
-		goto out;
-	}
+	if (from_deleted_inodes) {
+		if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+		    !fsck_err(trans, deleted_inode_but_clean,
+			      "filesystem marked as clean but have deleted inode %llu:%u",
+			      pos.offset, pos.snapshot)) {
+			ret = 0;
+			goto out;
+		}
 
-	ret = 1;
+		ret = 1;
+	}
 out:
 fsck_err:
 	bch2_trans_iter_exit(trans, &inode_iter);
@@ -1478,6 +1504,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
 	goto out;
 }
 
+static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
+{
+	u32 snapshot;
+
+	return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+		may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
+}
+
 int bch2_delete_dead_inodes(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
@@ -1501,7 +1535,7 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
 					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
 					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
-		ret = may_delete_deleted_inode(trans, k.k->p);
+		ret = may_delete_deleted_inode(trans, k.k->p, true);
 		if (ret > 0) {
 			bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
 						k.k->p.offset, k.k->p.snapshot);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 8999dab1ff9b..6fdbf265e4c0 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -244,6 +244,7 @@ enum bch_fsck_flags {
 	x(inode_parent_has_case_insensitive_not_set,		317,	FSCK_AUTOFIX)	\
 	x(vfs_inode_i_blocks_underflow,				311,	FSCK_AUTOFIX)	\
 	x(vfs_inode_i_blocks_not_zero_at_truncate,		313,	FSCK_AUTOFIX)	\
+	x(vfs_bad_inode_rm,					320,	0)		\
 	x(deleted_inode_but_clean,				211,	FSCK_AUTOFIX)	\
 	x(deleted_inode_missing,				212,	FSCK_AUTOFIX)	\
 	x(deleted_inode_is_dir,					213,	FSCK_AUTOFIX)	\
@@ -329,7 +330,7 @@ enum bch_fsck_flags {
 	x(dirent_stray_data_after_cf_name,			305,	0)		\
 	x(rebalance_work_incorrectly_set,			309,	FSCK_AUTOFIX)	\
 	x(rebalance_work_incorrectly_unset,			310,	FSCK_AUTOFIX)	\
-	x(MAX,							320,	0)
+	x(MAX,							321,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,

From 29cc6fb7c068c773049d3bde14b939033893eff4 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 2 Jun 2025 19:48:27 -0400
Subject: [PATCH 67/69] bcachefs: Fix subvol to missing root repair

We had a bug where the root inode of a subvolume was erronously deleted:
bch2_evict_inode() called bch2_inode_rm(), meaning the VFS inode's
i_nlink was somehow set to 0 when it shouldn't have - the inode in the
btree indicated it clearly was not unlinked.

This has been addressed with additional safety checks in
bch2_inode_rm() - pulling in the safety checks we already were doing
when deleting unlinked inodes in recovery - but the really disastrous
bug was in check_subvols(), which on finding a dangling subvol (subvol
with a missing root inode) would delete the subvolume.

I assume this bug dates from early check_directory_structure() code,
which originally handled subvolumes and normal paths - the idea being
that still live contents of the subvolume would get reattached
somewhere.

But that's incorrect, and disastrously so; deleting a subvolume triggers
deleting the snapshot ID it points to, deleting the entire contents.

The correct way to repair is to recreate the root inode if it's missing;
then any contents will get reattached under that subvolume's lost+found.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/subvolume.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 60c10365c285..020587449123 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -130,10 +130,20 @@ static int check_subvol(struct btree_trans *trans,
 			     "subvolume %llu points to missing subvolume root %llu:%u",
 			     k.k->p.offset, le64_to_cpu(subvol.v->inode),
 			     le32_to_cpu(subvol.v->snapshot))) {
-			ret = bch2_subvolume_delete(trans, iter->pos.offset);
-			bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
-			ret = ret ?: -BCH_ERR_transaction_restart_nested;
-			goto err;
+			/*
+			 * Recreate - any contents that are still disconnected
+			 * will then get reattached under lost+found
+			 */
+			bch2_inode_init_early(c, &inode);
+			bch2_inode_init_late(c, &inode, bch2_current_time(c),
+					     0, 0, S_IFDIR|0700, 0, NULL);
+			inode.bi_inum			= le64_to_cpu(subvol.v->inode);
+			inode.bi_snapshot		= le32_to_cpu(subvol.v->snapshot);
+			inode.bi_subvol			= k.k->p.offset;
+			inode.bi_parent_subvol		= le32_to_cpu(subvol.v->fs_path_parent);
+			ret = __bch2_fsck_write_inode(trans, &inode);
+			if (ret)
+				goto err;
 		}
 	} else {
 		goto err;

From a4b0f750505cc4ffdafca3e8e6bd731341c6bb68 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 3 Jun 2025 09:31:58 -0400
Subject: [PATCH 68/69] bcachefs: Make journal read log message a bit quieter

Users seem to be assuming that the 'dropped unflushed entries' message
at the end of journal read indicates some sort of problem, when it does
not - we expect there to be entries in the journal that weren't
commited, it's purely informational so that we can correlate journal
sequence numbers elsewhere when debugging.

Shorten the log message a bit to hopefully make this clearer.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/journal_io.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index d4c9980a6a7b..0b15d71a8d2d 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1427,12 +1427,12 @@ int bch2_journal_read(struct bch_fs *c,
 		return 0;
 	}
 
-	bch_info(c, "journal read done, replaying entries %llu-%llu",
-		 *last_seq, *blacklist_seq - 1);
-
+	printbuf_reset(&buf);
+	prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
+		   *last_seq, *blacklist_seq - 1);
 	if (*start_seq != *blacklist_seq)
-		bch_info(c, "dropped unflushed entries %llu-%llu",
-			 *blacklist_seq, *start_seq - 1);
+		prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
+	bch_info(c, "%s", buf.buf);
 
 	/* Drop blacklisted entries and entries older than last_seq: */
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {

From 3d11125ff624b540334f7134d98b94f3b980e85d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 3 Jun 2025 11:47:51 -0400
Subject: [PATCH 69/69] bcachefs: add cond_resched() to handle_overwrites()

Fix soft lockup warnings in btree nodes can.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_node_scan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 5a97a6b8a757..a35847734a60 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -363,6 +363,8 @@ static int handle_overwrites(struct bch_fs *c,
 				min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
 			}
 		}
+
+		cond_resched();
 	}
 
 	return 0;