diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d56057f27e8f..c14117227dd7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -814,11 +814,8 @@ mempool_t bio_bounce_pages; struct mutex verify_lock; #endif - struct mutex inode_create_lock; - unsigned unused_inodes_nr; - u64 unused_inodes[64]; - u32 unused_inodes_gens[64]; - u64 unused_inode_hint; + u64 *unused_inode_hints; + unsigned inode_shard_bits; /* * A btree node on disk could have too many bsets for an iterator to fit diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c55c164be882..b49c382f5452 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -361,55 +361,6 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -static int scan_free_inums(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; - struct bkey_s_c k; - u64 min = BLOCKDEV_INODE_MAX; - u64 max = c->opts.inodes_32bit - ? S32_MAX : S64_MAX; - u64 start = max(min, READ_ONCE(c->unused_inode_hint)); - int ret = 0; - - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, start), - BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); -again: - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; - - /* - * This doesn't check the btree key cache, but we don't care: - * we have to recheck with an intent lock held on the slot we're - * inserting to anyways: - */ - if (k.k->type != KEY_TYPE_inode) { - if (c->unused_inodes_nr < ARRAY_SIZE(c->unused_inodes)) { - c->unused_inodes[c->unused_inodes_nr] = k.k->p.offset; - c->unused_inodes_gens[c->unused_inodes_nr] = bkey_generation(k); - c->unused_inodes_nr++; - } - - if (c->unused_inodes_nr == ARRAY_SIZE(c->unused_inodes)) - goto out; - } - } - - if (!ret && start != min) { - max = start; - start = min; - bch2_btree_iter_set_pos(iter, POS(0, start)); - goto again; - } -out: - c->unused_inode_hint = iter->pos.offset; - bch2_trans_iter_put(trans, iter); - return ret; -} - int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u) { @@ -417,64 +368,68 @@ int bch2_inode_create(struct btree_trans *trans, struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 inum; - u32 generation; - int ret = 0; + u64 min, max, start, *hint; + int ret; + + unsigned cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 31 : 63) - c->inode_shard_bits; + + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + + start = READ_ONCE(*hint); + + if (start >= max || start < min) + start = min; inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) return PTR_ERR(inode_p); +again: + for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS_MIN, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); -retry: - if (!mutex_trylock(&c->inode_create_lock)) { - bch2_trans_unlock(trans); - mutex_lock(&c->inode_create_lock); - if (!bch2_trans_relock(trans)) { - mutex_unlock(&c->inode_create_lock); - ret = -EINTR; - goto err; - } + /* + * There's a potential cache coherency issue with the btree key + * cache code here - we're iterating over the btree, skipping + * that cache. We should never see an empty slot that isn't + * actually empty due to a pending update in the key cache + * because the update that creates the inode isn't done with a + * cached iterator, but - better safe than sorry, check the + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) + goto found_slot; } - if (!c->unused_inodes_nr) - ret = scan_free_inums(trans); - if (!ret && !c->unused_inodes_nr) - ret = -ENOSPC; - if (!ret) { - --c->unused_inodes_nr; - inum = c->unused_inodes[c->unused_inodes_nr]; - generation = c->unused_inodes_gens[c->unused_inodes_nr]; + bch2_trans_iter_put(trans, iter); + + if (ret) + return ret; + + if (start != min) { + /* Retry from start */ + start = min; + goto again; } - mutex_unlock(&c->inode_create_lock); - - if (ret) - goto err; - - bch2_btree_iter_set_pos(iter, POS(0, inum)); - - /* Recheck that the slot is free with an intent lock held: */ - k = bch2_btree_iter_peek_cached(iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type == KEY_TYPE_inode) - goto retry; - - inode_u->bi_inum = inum; - inode_u->bi_generation = generation; + return -ENOSPC; +found_slot: + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -err: bch2_trans_iter_put(trans, iter); - return ret; + return 0; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fd78ab205865..a2ade0df62b5 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -493,6 +493,7 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) @@ -703,8 +704,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->usage_lock); - mutex_init(&c->inode_create_lock); - c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -746,6 +745,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", @@ -764,6 +765,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) ||