bcachefs: "Journal stuck" timeout now takes into account device latency

If a block device (e.g. your typical consumer SSD) is taking multiple
seconds for IOs (typically flushes), we don't want to emit the "journal
stuck" message prematurely.

Also, make sure to drop the btree_trans srcu lock if we're blocking for
more than a second.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet
2025-01-21 17:42:25 -05:00
parent f917016f69
commit 2c5d8a8347
3 changed files with 33 additions and 8 deletions

View File

@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
trans->journal_u64s, flags);
trans->journal_u64s, flags, trans);
}
#define JSET_ENTRY_LOG_U64s 4

View File

@@ -601,6 +601,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
: -BCH_ERR_journal_res_get_blocked;
}
static unsigned max_dev_latency(struct bch_fs *c)
{
u64 nsecs = 0;
for_each_rw_member(c, ca)
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
return nsecs_to_jiffies(nsecs);
}
/*
* Essentially the entry function to the journaling code. When bcachefs is doing
* a btree insert, it calls this function to get the current journal write.
@@ -612,17 +622,31 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned flags)
unsigned flags,
struct btree_trans *trans)
{
int ret;
if (closure_wait_event_timeout(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK),
HZ * 10))
HZ))
return ret;
if (trans)
bch2_trans_unlock_long(trans);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
remaining_wait = max(0, remaining_wait - HZ);
if (closure_wait_event_timeout(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK),
remaining_wait))
return ret;
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
@@ -727,7 +751,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
* livelock:
*/
sched_annotate_sleep();
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;
@@ -848,7 +872,7 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
static int __bch2_journal_meta(struct journal *j)
{
struct journal_res res = {};
int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;

View File

@@ -312,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned);
unsigned, struct btree_trans *);
/* First bits for BCH_WATERMARK: */
enum journal_res_flags {
@@ -368,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j,
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s, unsigned flags)
unsigned u64s, unsigned flags,
struct btree_trans *trans)
{
int ret;
@@ -380,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
if (journal_res_get_fast(j, res, flags))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, flags);
ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
if (ret)
return ret;
out: