bcachefs: Allocator refactoring

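This uses the kthread_wait_freezable() macro to simplify a lot of the allocator thread code, and cleans up bch2_invalidate_one_bucket2(): it is renamed to bch2_invalidate_one_bucket(), and its btree update is split out into the new bucket_invalidate_btree() helper, which is run via bch2_trans_do().

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>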
2026-05-03 18:12:25 -04:00 · 2021-04-17 20:37:04 -04:00
parent fa272f33bb
commit 89baec780f
3 changed files with 162 additions and 312 deletions
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -441,50 +441,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 * commands to the newly free buckets, then puts them on the various freelists.
 */

-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
-{
-	unsigned long gc_count = c->gc_count;
-	s64 available;
-	int ret = 0;
-
-	ca->allocator_state = ALLOCATOR_blocked;
-	closure_wake_up(&c->freelist_wait);
-
-	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (kthread_should_stop()) {
-			ret = 1;
-			break;
-		}
-
-		if (gc_count != c->gc_count)
-			ca->inc_gen_really_needs_gc = 0;
-
-		available  = dev_buckets_reclaimable(ca);
-		available -= ca->inc_gen_really_needs_gc;
-
-		available = max(available, 0LL);
-
-		if (available)
-			break;
-
-		schedule();
-		try_to_freeze();
-	}
-
-	__set_current_state(TASK_RUNNING);
-	ca->allocator_state = ALLOCATOR_running;
-	closure_wake_up(&c->freelist_wait);
-
-	return ret;
-}
-
 static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
 				       struct bucket_mark m)
 {
@@ -502,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,

 	gc_gen = bucket_gc_gen(bucket(ca, b));

-	if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
-		ca->inc_gen_needs_gc++;
-
-	if (gc_gen >= BUCKET_GC_GEN_MAX)
-		ca->inc_gen_really_needs_gc++;
+	ca->inc_gen_needs_gc		+= gc_gen >= BUCKET_GC_GEN_MAX / 2;
+	ca->inc_gen_really_needs_gc	+= gc_gen >= BUCKET_GC_GEN_MAX;

 	return gc_gen < BUCKET_GC_GEN_MAX;
 }
@@ -583,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 		struct bucket_mark m = READ_ONCE(g->mark);
 		unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);

+		cond_resched();
+
 		if (!bch2_can_invalidate_bucket(ca, b, m))
 			continue;

@@ -599,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 				.key	= key,
 			};
 		}
-
-		cond_resched();
 	}

 	if (e.nr)
@@ -693,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 	size_t i, nr = 0;

 	ca->inc_gen_needs_gc			= 0;
+	ca->inc_gen_really_needs_gc		= 0;

 	switch (ca->mi.replacement) {
 	case BCH_CACHE_REPLACEMENT_lru:
@@ -714,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 	return nr;
 }

-static inline long next_alloc_bucket(struct bch_dev *ca)
-{
-	struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
-	while (ca->alloc_heap.used) {
-		if (top->nr) {
-			size_t b = top->bucket;
-
-			top->bucket++;
-			top->nr--;
-			return b;
-		}
-
-		heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-	}
-
-	return -1;
-}
-
 /*
 * returns sequence number of most recent journal entry that updated this
 * bucket:
@@ -755,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
 	}
 }

-static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
-				       struct bch_dev *ca,
-				       struct btree_iter *iter,
-				       u64 *journal_seq, unsigned flags)
+static int bucket_invalidate_btree(struct btree_trans *trans,
+				   struct bch_dev *ca, u64 b)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_alloc_buf a;
+	struct bkey_alloc_buf *a;
 	struct bkey_alloc_unpacked u;
 	struct bucket *g;
 	struct bucket_mark m;
-	bool invalidating_cached_data;
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_alloc,
+				    POS(ca->dev_idx, b),
+				    BTREE_ITER_CACHED|
+				    BTREE_ITER_CACHED_NOFILL|
+				    BTREE_ITER_INTENT);
+	int ret;
+
+	a = bch2_trans_kmalloc(trans, sizeof(*a));
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto err;
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
+	percpu_down_read(&c->mark_lock);
+	g = bucket(ca, b);
+	m = READ_ONCE(g->mark);
+	u = alloc_mem_to_key(iter, g, m);
+	percpu_up_read(&c->mark_lock);
+
+	u.gen++;
+	u.data_type	= 0;
+	u.dirty_sectors	= 0;
+	u.cached_sectors = 0;
+	u.read_time	= atomic64_read(&c->io_clock[READ].now);
+	u.write_time	= atomic64_read(&c->io_clock[WRITE].now);
+
+	bch2_alloc_pack(c, a, u);
+	bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE);
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+				      u64 *journal_seq, unsigned flags)
+{
+	struct bucket *g;
+	struct bucket_mark m;
 	size_t b;
 	int ret = 0;

@@ -811,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 		goto out;
 	}

-	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
-retry:
-	ret = bch2_btree_iter_traverse(iter);
-	if (ret)
-		return ret;
-
-	percpu_down_read(&c->mark_lock);
-	g = bucket(ca, iter->pos.offset);
-	m = READ_ONCE(g->mark);
-	u = alloc_mem_to_key(iter, g, m);
-
-	percpu_up_read(&c->mark_lock);
-
-	invalidating_cached_data = u.cached_sectors != 0;
-
-	u.gen++;
-	u.data_type	= 0;
-	u.dirty_sectors	= 0;
-	u.cached_sectors = 0;
-	u.read_time	= atomic64_read(&c->io_clock[READ].now);
-	u.write_time	= atomic64_read(&c->io_clock[WRITE].now);
-
-	bch2_alloc_pack(c, &a, u);
-	bch2_trans_update(trans, iter, &a.k,
-			  BTREE_TRIGGER_BUCKET_INVALIDATE);
-
-	/*
-	 * XXX:
-	 * when using deferred btree updates, we have journal reclaim doing
-	 * btree updates and thus requiring the allocator to make forward
-	 * progress, and here the allocator is requiring space in the journal -
-	 * so we need a journal pre-reservation:
-	 */
-	ret = bch2_trans_commit(trans, NULL,
-				invalidating_cached_data ? journal_seq : NULL,
-				BTREE_INSERT_NOUNLOCK|
-				BTREE_INSERT_NOCHECK_RW|
-				BTREE_INSERT_NOFAIL|
-				BTREE_INSERT_JOURNAL_RESERVED|
-				flags);
-	if (ret == -EINTR)
-		goto retry;
+	ret = bch2_trans_do(c, NULL, journal_seq,
+			    BTREE_INSERT_NOCHECK_RW|
+			    BTREE_INSERT_NOFAIL|
+			    BTREE_INSERT_JOURNAL_RESERVED|
+			    flags,
+			    bucket_invalidate_btree(&trans, ca, b));
 out:
 	if (!ret) {
 		/* remove from alloc_heap: */
@@ -894,28 +832,23 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 */
 static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct btree_trans trans;
-	struct btree_iter *iter;
 	u64 journal_seq = 0;
 	int ret = 0;

-	bch2_trans_init(&trans, c, 0, 0);
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
-				   POS(ca->dev_idx, 0),
-				   BTREE_ITER_CACHED|
-				   BTREE_ITER_CACHED_NOFILL|
-				   BTREE_ITER_INTENT);
-
 	/* Only use nowait if we've already invalidated at least one bucket: */
 	while (!ret &&
 	       !fifo_full(&ca->free_inc) &&
-	       ca->alloc_heap.used)
-		ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq,
+	       ca->alloc_heap.used) {
+		ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
 				(!fifo_empty(&ca->free_inc)
 				 ? BTREE_INSERT_NOWAIT : 0));
-
-	bch2_trans_iter_put(&trans, iter);
-	bch2_trans_exit(&trans);
+		/*
+		 * We only want to batch up invalidates when they're going to
+		 * require flushing the journal:
+		 */
+		if (!journal_seq)
+			break;
+	}

 	/* If we used NOWAIT, don't return the error: */
 	if (!fifo_empty(&ca->free_inc))
@@ -935,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
 	return 0;
 }

-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
+{
+	if (ca->allocator_state != new_state) {
+		ca->allocator_state = new_state;
+		closure_wake_up(&ca->fs->freelist_wait);
+	}
+}
+
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
 {
 	unsigned i;
 	int ret = 0;

-	while (1) {
-		set_current_state(TASK_INTERRUPTIBLE);
+	spin_lock(&c->freelist_lock);
+	for (i = 0; i < RESERVE_NR; i++) {
+		/*
+		 * Don't strand buckets on the copygc freelist until
+		 * after recovery is finished:
+		 */
+		if (i == RESERVE_MOVINGGC &&
+		    !test_bit(BCH_FS_STARTED, &c->flags))
+			continue;

-		spin_lock(&c->freelist_lock);
-		for (i = 0; i < RESERVE_NR; i++) {
-
-			/*
-			 * Don't strand buckets on the copygc freelist until
-			 * after recovery is finished:
-			 */
-			if (!test_bit(BCH_FS_STARTED, &c->flags) &&
-			    i == RESERVE_MOVINGGC)
-				continue;
-
-			if (fifo_push(&ca->free[i], bucket)) {
-				fifo_pop(&ca->free_inc, bucket);
-
-				closure_wake_up(&c->freelist_wait);
-				ca->allocator_state = ALLOCATOR_running;
-
-				spin_unlock(&c->freelist_lock);
-				goto out;
-			}
-		}
-
-		if (ca->allocator_state != ALLOCATOR_blocked_full) {
-			ca->allocator_state = ALLOCATOR_blocked_full;
-			closure_wake_up(&c->freelist_wait);
-		}
-
-		spin_unlock(&c->freelist_lock);
-
-		if ((current->flags & PF_KTHREAD) &&
-		    kthread_should_stop()) {
+		if (fifo_push(&ca->free[i], b)) {
+			fifo_pop(&ca->free_inc, b);
 			ret = 1;
 			break;
 		}
-
-		schedule();
-		try_to_freeze();
 	}
-out:
-	__set_current_state(TASK_RUNNING);
+	spin_unlock(&c->freelist_lock);
+
+	ca->allocator_state = ret
+		? ALLOCATOR_running
+		: ALLOCATOR_blocked_full;
+	closure_wake_up(&c->freelist_wait);
 	return ret;
 }

-/*
- * Pulls buckets off free_inc, discards them (if enabled), then adds them to
- * freelists, waiting until there's room if necessary:
- */
-static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
 {
-	while (!fifo_empty(&ca->free_inc)) {
-		size_t bucket = fifo_peek(&ca->free_inc);
-
-		if (ca->mi.discard &&
-		    bdev_max_discard_sectors(ca->disk_sb.bdev))
-			blkdev_issue_discard(ca->disk_sb.bdev,
-					     bucket_to_sector(ca, bucket),
-					     ca->mi.bucket_size, GFP_NOIO);
-
-		if (push_invalidated_bucket(c, ca, bucket))
-			return 1;
-	}
-
-	return 0;
+	if (ca->mi.discard &&
+	    bdev_max_discard_sectors(ca->disk_sb.bdev))
+		blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
+				     ca->mi.bucket_size, GFP_NOFS);
 }

-static inline bool allocator_thread_running(struct bch_dev *ca)
+static bool allocator_thread_running(struct bch_dev *ca)
 {
-	return ca->mi.state == BCH_MEMBER_STATE_rw &&
-		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
+		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
+		? ALLOCATOR_running
+		: ALLOCATOR_stopped;
+	alloc_thread_set_state(ca, state);
+	return state == ALLOCATOR_running;
+}
+
+static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
+{
+	s64 available = dev_buckets_reclaimable(ca) -
+		(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
+	bool ret = available > 0;
+
+	alloc_thread_set_state(ca, ret
+			       ? ALLOCATOR_running
+			       : ALLOCATOR_blocked);
+	return ret;
 }

 /**
@@ -1026,56 +948,29 @@ static int bch2_allocator_thread(void *arg)
 {
 	struct bch_dev *ca = arg;
 	struct bch_fs *c = ca->fs;
+	unsigned long gc_count = c->gc_count;
 	size_t nr;
 	int ret;

 	set_freezable();

 	while (1) {
-		if (!allocator_thread_running(ca)) {
-			ca->allocator_state = ALLOCATOR_stopped;
-			if (kthread_wait_freezable(allocator_thread_running(ca)))
-				break;
-		}
-
-		ca->allocator_state = ALLOCATOR_running;
-
-		cond_resched();
-		if (kthread_should_stop())
-			break;
-
-		pr_debug("discarding %zu invalidated buckets",
-			 fifo_used(&ca->free_inc));
-
-		ret = discard_invalidated_buckets(c, ca);
+		ret = kthread_wait_freezable(allocator_thread_running(ca));
 		if (ret)
 			goto stop;

-		ret = bch2_invalidate_buckets(c, ca);
-		if (ret)
-			goto stop;
-
-		if (!fifo_empty(&ca->free_inc))
-			continue;
-
-		pr_debug("free_inc now empty");
-
-		while (1) {
+		while (!ca->alloc_heap.used) {
 			cond_resched();
-			/*
-			 * Find some buckets that we can invalidate, either
-			 * they're completely unused, or only contain clean data
-			 * that's been written back to the backing device or
-			 * another cache tier
-			 */

-			pr_debug("scanning for reclaimable buckets");
+			ret = kthread_wait_freezable(buckets_available(ca, gc_count));
+			if (ret)
+				goto stop;

+			gc_count = c->gc_count;
 			nr = find_reclaimable_buckets(c, ca);

-			pr_debug("found %zu buckets", nr);
-
-			trace_alloc_batch(ca, nr, ca->alloc_heap.size);
+			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+					 ca->inc_gen_really_needs_gc);

 			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
 			     ca->inc_gen_really_needs_gc) &&
@@ -1083,33 +978,24 @@ static int bch2_allocator_thread(void *arg)
 				atomic_inc(&c->kick_gc);
 				wake_up_process(c->gc_thread);
 			}
+		}

-			if (nr)
-				break;
+		ret = bch2_invalidate_buckets(c, ca);
+		if (ret)
+			goto stop;

-			/*
-			 * If we found any buckets, we have to invalidate them
-			 * before we scan for more - but if we didn't find very
-			 * many we may want to wait on more buckets being
-			 * available so we don't spin:
-			 */
-			ret = wait_buckets_available(c, ca);
+		while (!fifo_empty(&ca->free_inc)) {
+			u64 b = fifo_peek(&ca->free_inc);
+
+			discard_one_bucket(c, ca, b);
+
+			ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
 			if (ret)
 				goto stop;
 		}
-
-		pr_debug("%zu buckets to invalidate", nr);
-
-		/*
-		 * alloc_heap is now full of newly-invalidated buckets: next,
-		 * write out the new bucket gens:
-		 */
 	}
-
 stop:
-	pr_debug("alloc thread stopping (ret %i)", ret);
-	ca->allocator_state = ALLOCATOR_stopped;
-	closure_wake_up(&c->freelist_wait);
+	alloc_thread_set_state(ca, ALLOCATOR_stopped);
 	return 0;
 }

--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1,57 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Primary bucket allocation code
- *
 * Copyright 2012 Google, Inc.
 *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch2_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
 */

 #include "bcachefs.h"
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -380,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,

 /* Allocator */

-TRACE_EVENT(alloc_batch,
-	TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
-	TP_ARGS(ca, free, total),
+TRACE_EVENT(alloc_scan,
+	TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
+	TP_ARGS(ca, found, inc_gen, inc_gen_skipped),

 	TP_STRUCT__entry(
-		__array(char,		uuid,	16	)
-		__field(size_t,		free		)
-		__field(size_t,		total		)
+		__field(dev_t,		dev		)
+		__field(u64,		found		)
+		__field(u64,		inc_gen		)
+		__field(u64,		inc_gen_skipped	)
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, ca->uuid.b, 16);
-		__entry->free = free;
-		__entry->total = total;
+		__entry->dev		= ca->disk_sb.bdev->bd_dev;
+		__entry->found		= found;
+		__entry->inc_gen	= inc_gen;
+		__entry->inc_gen_skipped = inc_gen_skipped;
 	),

-	TP_printk("%pU free %zu total %zu",
-		__entry->uuid, __entry->free, __entry->total)
+	TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
 );

 TRACE_EVENT(invalidate,
@@ -417,8 +420,10 @@ TRACE_EVENT(invalidate,
 	),

 	TP_printk("invalidated %u sectors at %d,%d sector=%llu",
-		  __entry->sectors, MAJOR(__entry->dev),
-		  MINOR(__entry->dev), __entry->offset)
+		  __entry->sectors,
+		  MAJOR(__entry->dev),
+		  MINOR(__entry->dev),
+		  __entry->offset)
 );

 DECLARE_EVENT_CLASS(bucket_alloc,
@@ -426,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc,
 	TP_ARGS(ca, reserve),

 	TP_STRUCT__entry(
-		__array(char,			uuid,	16)
-		__field(enum alloc_reserve,	reserve	  )
+		__field(dev_t,			dev	)
+		__field(enum alloc_reserve,	reserve	)
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, ca->uuid.b, 16);
-		__entry->reserve = reserve;
+		__entry->dev		= ca->disk_sb.bdev->bd_dev;
+		__entry->reserve	= reserve;
 	),

-	TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+	TP_printk("%d,%d reserve %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->reserve)
 );

 DEFINE_EVENT(bucket_alloc, bucket_alloc,