bcachefs: bch2_write_op_error() now prints info about data update

A user has been seeing the "error verifying existing checksum while
rewriting existing data (memory corruption?)" error.

This generally indicates a hardware issue (and that may be the case
here), but it might also indicate a bug, in which case we need more
information to look for patterns.

Reported-by: Roland Vet <vet.roland@protonmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet
2025-02-10 17:04:08 -05:00
parent 3faa4647a0
commit 1ccbcd3205
5 changed files with 80 additions and 35 deletions

View File

@@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc->compressed_size << 9 > c->opts.encoded_extent_max) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error rewriting existing data: extent too big");
bch2_write_op_error(&buf, op, op->pos.offset,
"extent too big to decompress");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
@@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
if (__bio_uncompress(c, bio, data.b, *crc)) {
if (!c->opts.no_data_io) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error rewriting existing data: decompression error");
bch2_write_op_error(&buf, op, op->pos.offset,
"decompression error");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}

View File

@@ -580,3 +580,9 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb
prt_printf(out, " offset %llu: ", pos.offset << 8);
return 0;
}
/*
 * bch2_inum_snap_offset_err_msg - wrapper around
 * bch2_inum_snap_offset_err_msg_trans() for callers that have a bch_fs but
 * no btree_trans.
 *
 * @c:   filesystem handed to bch2_trans_do()
 * @out: printbuf forwarded to the _trans helper
 * @pos: position forwarded to the _trans helper
 *
 * The value produced by bch2_trans_do() is discarded (this function returns
 * void), so a failure in the helper is not reported to the caller.
 *
 * NOTE(review): `trans` is not declared in this function; presumably it is
 * supplied by the bch2_trans_do() macro - confirm against its definition.
 */
void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
struct bpos pos)
{
bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}

View File

@@ -243,5 +243,6 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
#endif /* _BCACHEFS_ERROR_H */

View File

@@ -396,29 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
u64 offset)
void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
bch2_inum_offset_err_msg(op->c, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
prt_printf(out, "write error%s: ",
op->flags & BCH_WRITE_move ? "(internal move)" : "");
if (op->subvol)
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9));
else {
struct bpos pos = op->pos;
pos.offset = offset;
lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}
prt_str(out, "write error: ");
va_list args;
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
prt_printf(out, "\n from internal move ");
bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
}
}
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset,
const char *fmt, ...)
{
__bch2_write_op_error(out, op, op->pos.offset);
}
if (op->subvol)
bch2_inum_offset_err_msg(op->c, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
else {
struct bpos pos = op->pos;
pos.offset = offset;
bch2_inum_snap_offset_err_msg(op->c, out, pos);
}
static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_write_op *op, u64 offset)
{
bch2_inum_offset_err_msg_trans(trans, out,
(subvol_inum) { op->subvol, op->pos.inode, },
offset << 9);
prt_printf(out, "write error%s: ",
op->flags & BCH_WRITE_move ? "(internal move)" : "");
prt_str(out, "write error: ");
va_list args;
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
if (op->flags & BCH_WRITE_move) {
struct data_update *u = container_of(op, struct data_update, op);
prt_printf(out, "\n from internal move ");
bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
}
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -561,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k),
"btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@@ -1114,8 +1146,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
csum_err:
{
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
bch2_write_op_error(&buf, op, op->pos.offset,
"error verifying existing checksum while rewriting existing data (memory corruption?)");
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@@ -1211,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k),
"btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@@ -1379,8 +1411,8 @@ static void bch2_nocow_write(struct bch_write_op *op)
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
bch2_write_op_error(&buf, op, op->pos.offset,
"%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
op->error = ret;
@@ -1502,8 +1534,8 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(ret < 0)) {
if (!(op->flags & BCH_WRITE_alloc_nowait)) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
bch2_write_op_error(&buf, op, op->pos.offset,
"%s(): %s", __func__, bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
}
@@ -1634,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write)
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
struct printbuf buf = PRINTBUF;
bch2_write_op_error(&buf, op);
prt_printf(&buf, "misaligned write");
bch2_write_op_error(&buf, op, op->pos.offset,
"misaligned write");
printbuf_exit(&buf);
op->error = -EIO;
goto err;

View File

@@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
__printf(5, 6)
void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_write_op *op, u64, const char *, ...);
__printf(4, 5)
void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64,
const char *, ...);
#define BCH_WRITE_FLAGS() \
x(alloc_nowait) \