bcachefs: Scrub

Add a new data op to walk all data and metadata in a filesystem,
checking if it can be read successfully, and on error repairing from
another copy if possible.

- New helper: bch2_dev_idx_is_online(), so that we can bail out and
  report to userspace when we're unable to scrub because the device is
  offline

- data_update_opts, which controls the data move path, now understands
  scrub: data is only read, not written. The read path is responsible
  for rewriting on read error, as with other reads.

- scrub_pred skips data extents that don't have checksums

- bch_ioctl_data has a new scrub member, with a data_types field
  selecting which data types to check - i.e. all data types, or only
  metadata (see the usage sketch below)

- Add new entries to bch_move_stats so that we can report numbers for
  corrected and uncorrected errors

- Add a new enum to bch_ioctl_data_event for explicitly reporting
  completion and return code (e.g. device offline)
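
The following userspace sketch shows how the pieces fit together. It
is illustrative only, not the bcachefs-tools implementation: fs_fd is
assumed to be the filesystem's ioctl file descriptor, and error
handling is omitted. BCH_IOCTL_DATA returns a file descriptor from
which progress events are read, as implemented by bch2_data_job_read():

  struct bch_ioctl_data cmd = {
          .op               = BCH_DATA_OP_scrub,
          .scrub.dev        = 0,  /* device index to scrub */
          .scrub.data_types = ~0, /* all data types; or e.g. only btree */
  };
  int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd);

  struct bch_ioctl_data_event e;
  while (read(progress_fd, &e, sizeof(e)) == sizeof(e)) {
          printf("%llu/%llu sectors, %llu corrected, %llu uncorrected\n",
                 e.p.sectors_done, e.p.sectors_total,
                 e.p.sectors_error_corrected,
                 e.p.sectors_error_uncorrected);
          /* ret is 0 while running, else _RET_done or _RET_device_offline */
          if (e.ret)
                  break;
          sleep(1);
  }
  close(progress_fd);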

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet
Date:   2024-12-28 19:59:55 -05:00
parent  3e2ad29865
commit  f269ae55d2

9 changed files with 239 additions and 61 deletions

fs/bcachefs/bcachefs_ioctl.h

@@ -214,6 +214,10 @@ struct bch_ioctl_data {
         struct bpos             end_pos;
 
         union {
+        struct {
+                __u32           dev;
+                __u32           data_types;
+        } scrub;
         struct {
                 __u32           dev;
                 __u32           pad;
@@ -238,11 +242,19 @@ struct bch_ioctl_data_progress {
         __u64                   sectors_done;
         __u64                   sectors_total;
+        __u64                   sectors_error_corrected;
+        __u64                   sectors_error_uncorrected;
 } __packed __aligned(8);
 
+enum bch_ioctl_data_event_ret {
+        BCH_IOCTL_DATA_EVENT_RET_done           = 1,
+        BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
+};
+
 struct bch_ioctl_data_event {
         __u8                    type;
-        __u8                    pad[7];
+        __u8                    ret;
+        __u8                    pad[6];
         union {
         struct bch_ioctl_data_progress p;
         __u64                   pad2[15];

fs/bcachefs/chardev.c

@@ -313,7 +313,10 @@ static int bch2_data_thread(void *arg)
         struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
 
         ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
-        ctx->stats.done = true;
+        if (ctx->thr.ret == -BCH_ERR_device_offline)
+                ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+        else
+                ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
 
         return 0;
 }
@@ -332,14 +335,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
         struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
         struct bch_fs *c = ctx->c;
         struct bch_ioctl_data_event e = {
-                .type                   = BCH_DATA_EVENT_PROGRESS,
-                .p.data_type            = ctx->stats.done ? U8_MAX : ctx->stats.data_type,
-                .p.btree_id             = ctx->stats.pos.btree,
-                .p.pos                  = ctx->stats.pos.pos,
-                .p.sectors_done         = atomic64_read(&ctx->stats.sectors_seen),
-                .p.sectors_total        = bch2_fs_usage_read_short(c).used,
+                .type                           = BCH_DATA_EVENT_PROGRESS,
+                .ret                            = ctx->stats.ret,
+                .p.data_type                    = ctx->stats.data_type,
+                .p.btree_id                     = ctx->stats.pos.btree,
+                .p.pos                          = ctx->stats.pos.pos,
+                .p.sectors_done                 = atomic64_read(&ctx->stats.sectors_seen),
+                .p.sectors_error_corrected      = atomic64_read(&ctx->stats.sectors_error_corrected),
+                .p.sectors_error_uncorrected    = atomic64_read(&ctx->stats.sectors_error_uncorrected),
         };
 
+        if (ctx->arg.op == BCH_DATA_OP_scrub) {
+                struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+                if (ca) {
+                        struct bch_dev_usage u;
+                        bch2_dev_usage_read_fast(ca, &u);
+                        for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+                                if (ctx->arg.scrub.data_types & BIT(i))
+                                        e.p.sectors_total += u.d[i].sectors;
+                        bch2_dev_put(ca);
+                }
+        } else {
+                e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+        }
+
         if (len < sizeof(e))
                 return -EINVAL;

fs/bcachefs/data_update.c

@@ -673,12 +673,46 @@ static bool can_allocate_without_blocking(struct bch_fs *c,
         return nr_replicas >= m->op.nr_replicas;
 }
 
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+                               struct bch_io_opts *io_opts)
+{
+        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+        const union bch_extent_entry *entry;
+        struct extent_ptr_decoded p;
+
+        /* write path might have to decompress data: */
+        unsigned buf_bytes = 0;
+        bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+                buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+        unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+        m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+        if (!m->bvecs)
+                return -ENOMEM;
+
+        bio_init(&m->rbio.bio,          NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+        bio_init(&m->op.wbio.bio,       NULL, m->bvecs, nr_vecs, 0);
+
+        if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+                kfree(m->bvecs);
+                m->bvecs = NULL;
+                return -ENOMEM;
+        }
+
+        rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+        m->rbio.bio.bi_iter.bi_size     = buf_bytes;
+        m->rbio.bio.bi_iter.bi_sector   = bkey_start_offset(&m->k.k->k);
+        m->op.wbio.bio.bi_ioprio        = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+        return 0;
+}
+
 int bch2_data_update_init(struct btree_trans *trans,
                           struct btree_iter *iter,
                           struct moving_context *ctxt,
                           struct data_update *m,
                           struct write_point_specifier wp,
-                          struct bch_io_opts io_opts,
+                          struct bch_io_opts *io_opts,
                           struct data_update_opts data_opts,
                           enum btree_id btree_id,
                           struct bkey_s_c k)
@@ -705,7 +739,7 @@ int bch2_data_update_init(struct btree_trans *trans,
         m->ctxt = ctxt;
         m->stats = ctxt ? ctxt->stats : NULL;
 
-        bch2_write_op_init(&m->op, c, io_opts);
+        bch2_write_op_init(&m->op, c, *io_opts);
         m->op.pos       = bkey_start_pos(k.k);
         m->op.version   = k.k->bversion;
         m->op.target    = data_opts.target;
@@ -716,7 +750,7 @@ int bch2_data_update_init(struct btree_trans *trans,
                 BCH_WRITE_data_encoded|
                 BCH_WRITE_move|
                 m->data_opts.write_flags;
-        m->op.compression_opt   = io_opts.background_compression;
+        m->op.compression_opt   = io_opts->background_compression;
         m->op.watermark         = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
 
         unsigned durability_have = 0, durability_removing = 0;
@@ -754,7 +788,7 @@ int bch2_data_update_init(struct btree_trans *trans,
                 ptr_bit <<= 1;
         }
 
-        unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+        unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
 
         /*
          * If current extent durability is less than io_opts.data_replicas,
@@ -787,7 +821,7 @@ int bch2_data_update_init(struct btree_trans *trans,
                 m->data_opts.rewrite_ptrs = 0;
                 /* if iter == NULL, it's just a promote */
                 if (iter)
-                        ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
+                        ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
                 if (!ret)
                         ret = -BCH_ERR_data_update_done_no_writes_needed;
                 goto out_bkey_buf_exit;
@@ -825,33 +859,11 @@ int bch2_data_update_init(struct btree_trans *trans,
                 goto out_nocow_unlock;
         }
 
-        /* write path might have to decompress data: */
-        unsigned buf_bytes = 0;
-        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
-        unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
-        m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
-        if (!m->bvecs)
-                goto enomem;
-
-        bio_init(&m->rbio.bio,          NULL, m->bvecs, nr_vecs, REQ_OP_READ);
-        bio_init(&m->op.wbio.bio,       NULL, m->bvecs, nr_vecs, 0);
-
-        if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL))
-                goto enomem;
-
-        rbio_init(&m->rbio.bio, c, io_opts, NULL);
-        m->rbio.bio.bi_iter.bi_size     = buf_bytes;
-        m->rbio.bio.bi_iter.bi_sector   = bkey_start_offset(k.k);
-        m->op.wbio.bio.bi_ioprio        = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+        ret = bch2_data_update_bios_init(m, c, io_opts);
+        if (ret)
+                goto out_nocow_unlock;
 
         return 0;
-enomem:
-        ret = -ENOMEM;
-        kfree(m->bvecs);
-        m->bvecs = NULL;
-
 out_nocow_unlock:
         if (c->opts.nocow_enabled)
                 bkey_nocow_unlock(c, k);

fs/bcachefs/data_update.h

@@ -16,6 +16,9 @@ struct data_update_opts {
         u8              extra_replicas;
         unsigned        btree_insert_flags;
         unsigned        write_flags;
+
+        int             read_dev;
+        bool            scrub;
 };
 
 void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
@@ -48,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
                           struct bch_io_opts *,
                           struct data_update_opts *);
 
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+                               struct bch_io_opts *);
+
 void bch2_data_update_exit(struct data_update *);
 int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
                           struct moving_context *,
                           struct data_update *,
                           struct write_point_specifier,
-                          struct bch_io_opts, struct data_update_opts,
+                          struct bch_io_opts *, struct data_update_opts,
                           enum btree_id, struct bkey_s_c);
 
 void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);

fs/bcachefs/io_read.c

@@ -243,7 +243,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
         ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
                         writepoint_hashed((unsigned long) current),
-                        orig->opts,
+                        &orig->opts,
                         update_opts,
                         btree_id, k);
 
         /*
@@ -488,6 +488,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
                             blk_status_t error)
 {
         rbio->retry = retry;
+        rbio->saw_error = true;
 
         if (rbio->flags & BCH_READ_in_retry)
                 return;
@@ -969,6 +970,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                  */
                 struct data_update *u = container_of(orig, struct data_update, rbio);
                 if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
-                        BUG();
+                        if (ca)
+                                percpu_ref_put(&ca->io_ref);
                         goto hole;

fs/bcachefs/io_read.h

@@ -41,6 +41,7 @@ struct bch_read_bio {
                         have_ioref:1,
                         narrow_crcs:1,
                         hole:1,
+                        saw_error:1,
                         retry:2,
                         context:2;
 };

fs/bcachefs/move.c

@@ -89,7 +89,12 @@ static void move_free(struct moving_io *io)
                 wake_up(&ctxt->wait);
         mutex_unlock(&ctxt->lock);
 
-        bch2_data_update_exit(&io->write);
+        if (!io->write.data_opts.scrub) {
+                bch2_data_update_exit(&io->write);
+        } else {
+                bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+                kfree(io->write.bvecs);
+        }
         kfree(io);
 }
@@ -109,7 +114,20 @@ static void move_write_done(struct bch_write_op *op)
 
 static void move_write(struct moving_io *io)
 {
-        if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) {
+        struct moving_context *ctxt = io->write.ctxt;
+
+        if (ctxt->stats) {
+                if (io->write.rbio.bio.bi_status)
+                        atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+                                     &ctxt->stats->sectors_error_uncorrected);
+                else if (io->write.rbio.saw_error)
+                        atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+                                     &ctxt->stats->sectors_error_corrected);
+        }
+
+        if (unlikely(io->write.rbio.bio.bi_status ||
+                     io->write.rbio.hole ||
+                     io->write.data_opts.scrub)) {
                 move_free(io);
                 return;
         }
@@ -263,7 +281,8 @@ int bch2_move_extent(struct moving_context *ctxt,
         bch2_data_update_opts_normalize(k, &data_opts);
 
         if (!data_opts.rewrite_ptrs &&
-            !data_opts.extra_replicas) {
+            !data_opts.extra_replicas &&
+            !data_opts.scrub) {
                 if (data_opts.kill_ptrs)
                         return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
                 return 0;
@@ -284,16 +303,28 @@ int bch2_move_extent(struct moving_context *ctxt,
         io->read_sectors        = k.k->size;
         io->write_sectors       = k.k->size;
 
-        ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
-                            io_opts, data_opts, iter->btree_id, k);
-        if (ret)
-                goto err_free;
+        if (!data_opts.scrub) {
+                ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+                                            &io_opts, data_opts, iter->btree_id, k);
+                if (ret)
+                        goto err_free;
+
+                io->write.op.end_io = move_write_done;
+        } else {
+                bch2_bkey_buf_init(&io->write.k);
+                bch2_bkey_buf_reassemble(&io->write.k, c, k);
+
+                io->write.op.c = c;
+                io->write.data_opts = data_opts;
+
+                ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+                if (ret)
+                        goto err_free;
+        }
 
         io->write.rbio.bio.bi_end_io = move_read_endio;
         io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
 
-        io->write.op.end_io = move_write_done;
-
         if (ctxt->rate)
                 bch2_ratelimit_increment(ctxt->rate, k.k->size);
@@ -324,11 +355,14 @@ int bch2_move_extent(struct moving_context *ctxt,
          * ctxt when doing wakeup
          */
         closure_get(&ctxt->cl);
-        bch2_read_extent(trans, &io->write.rbio,
-                         bkey_start_pos(k.k),
-                         iter->btree_id, k, 0,
-                         BCH_READ_data_update|
-                         BCH_READ_last_fragment);
+        __bch2_read_extent(trans, &io->write.rbio,
+                           io->write.rbio.bio.bi_iter,
+                           bkey_start_pos(k.k),
+                           iter->btree_id, k, 0,
+                           NULL,
+                           BCH_READ_data_update|
+                           BCH_READ_last_fragment,
+                           data_opts.scrub ? data_opts.read_dev : -1);
         return 0;
 err_free:
         kfree(io);
@@ -669,6 +703,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
                                  unsigned dev,
                                  u64 bucket_start,
                                  u64 bucket_end,
+                                 unsigned data_types,
                                  move_pred_fn pred, void *arg)
 {
         struct btree_trans *trans = ctxt->trans;
@@ -737,6 +772,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
                 if (ctxt->stats)
                         ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
 
+                if (!(data_types & BIT(bp.v->data_type)))
+                        goto next;
+
                 k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
                 ret = bkey_err(k);
                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -760,17 +798,25 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
                         goto next;
                 }
 
+                if (data_opts.scrub &&
+                    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+                        bch2_trans_iter_exit(trans, &iter);
+                        ret = -BCH_ERR_device_offline;
+                        break;
+                }
+
                 bch2_bkey_buf_reassemble(&sk, c, k);
                 k = bkey_i_to_s_c(sk.k);
 
                 /* move_extent will drop locks */
-                unsigned sectors = !bp.v->level
-                        ? bp.v->bucket_len
-                        : btree_ptr_sectors_written(k);
+                unsigned sectors = bp.v->bucket_len;
 
-                ret = !bp.v->level
-                        ? bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts)
-                        : bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+                if (!bp.v->level)
+                        ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+                else if (!data_opts.scrub)
+                        ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+                else
+                        ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
 
                 bch2_trans_iter_exit(trans, &iter);
@@ -797,6 +843,30 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
         return ret;
 }
 
+static int bch2_move_data_phys(struct bch_fs *c,
+                               unsigned dev,
+                               u64 start,
+                               u64 end,
+                               unsigned data_types,
+                               struct bch_ratelimit *rate,
+                               struct bch_move_stats *stats,
+                               struct write_point_specifier wp,
+                               bool wait_on_copygc,
+                               move_pred_fn pred, void *arg)
+{
+        struct moving_context ctxt;
+
+        bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+        ctxt.stats->phys = true;
+
+        int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+        bch2_moving_ctxt_exit(&ctxt);
+
+        return ret;
+}
+
 struct evacuate_bucket_arg {
         struct bpos             bucket;
         int                     gen;
@@ -834,6 +904,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
                                        bucket.inode,
                                        bucket.offset,
                                        bucket.offset + 1,
+                                       ~0,
                                        evacuate_bucket_pred, &arg);
 }
@@ -1075,6 +1146,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
         return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
 }
 
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+                       struct bkey_s_c k,
+                       struct bch_io_opts *io_opts,
+                       struct data_update_opts *data_opts)
+{
+        struct bch_ioctl_data *arg = _arg;
+
+        if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+                const union bch_extent_entry *entry;
+                struct extent_ptr_decoded p;
+                bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+                        if (p.ptr.dev == arg->migrate.dev) {
+                                if (!p.crc.csum_type)
+                                        return false;
+                                break;
+                        }
+        }
+
+        data_opts->scrub        = true;
+        data_opts->read_dev     = arg->migrate.dev;
+        return true;
+}
+
 int bch2_data_job(struct bch_fs *c,
                   struct bch_move_stats *stats,
                   struct bch_ioctl_data op)
@@ -1089,6 +1184,22 @@ int bch2_data_job(struct bch_fs *c,
         bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
 
         switch (op.op) {
+        case BCH_DATA_OP_scrub:
+                /*
+                 * prevent tests from spuriously failing, make sure we see all
+                 * btree nodes that need to be repaired
+                 */
+                bch2_btree_interior_updates_flush(c);
+
+                ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+                                          op.scrub.data_types,
+                                          NULL,
+                                          stats,
+                                          writepoint_hashed((unsigned long) current),
+                                          false,
+                                          scrub_pred, &op) ?: ret;
+                break;
+
         case BCH_DATA_OP_rereplicate:
                 stats->data_type = BCH_DATA_journal;
                 ret = bch2_journal_flush_device_pins(&c->journal, -1);

fs/bcachefs/move_types.h

@@ -3,11 +3,12 @@
 #define _BCACHEFS_MOVE_TYPES_H
 
 #include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
 
 struct bch_move_stats {
         char                    name[32];
         bool                    phys;
-        bool                    done;
+        enum bch_ioctl_data_event_ret   ret;
 
         union {
         struct {
@@ -25,6 +26,8 @@ struct bch_move_stats {
         atomic64_t              sectors_seen;
         atomic64_t              sectors_moved;
         atomic64_t              sectors_raced;
+        atomic64_t              sectors_error_corrected;
+        atomic64_t              sectors_error_uncorrected;
 };
 
 struct move_bucket_key {

fs/bcachefs/sb-members.h

@@ -23,6 +23,18 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
         return !percpu_ref_is_zero(&ca->io_ref);
 }
 
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+        rcu_read_lock();
+        struct bch_dev *ca = bch2_dev_rcu(c, dev);
+        bool ret = ca && bch2_dev_is_online(ca);
+        rcu_read_unlock();
+        return ret;
+}
+
 static inline bool bch2_dev_is_readable(struct bch_dev *ca)
 {
         return bch2_dev_is_online(ca) &&