From e60aa787f43f1e469632b8719e3e3ab1d9e7b720 Mon Sep 17 00:00:00 2001 From: Jonathan Lassoff Date: Mon, 11 Apr 2022 13:06:28 +1000 Subject: [PATCH 1/7] xfs: Simplify XFS logging methods. Rather than have a constructor to define many nearly-identical functions, use preprocessor macros to pass down a kernel logging level to a common function. Signed-off-by: Jonathan Lassoff Reviewed-by: Chris Down Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_message.c | 52 +++++++++++++++++++------------------------- fs/xfs/xfs_message.h | 49 +++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bc66d95c8d4c..9ceebd4c9ff1 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -27,37 +27,29 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -#define define_xfs_printk_level(func, kern_level) \ -void func(const struct xfs_mount *mp, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int level; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - __xfs_printk(kern_level, mp, &vaf); \ - va_end(args); \ - \ - if (!kstrtoint(kern_level, 0, &level) && \ - level <= LOGLEVEL_ERR && \ - xfs_error_level >= XFS_ERRLEVEL_HIGH) \ - xfs_stack_trace(); \ -} \ +void +xfs_printk_level( + const char *kern_level, + const struct xfs_mount *mp, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int level; -define_xfs_printk_level(xfs_emerg, KERN_EMERG); -define_xfs_printk_level(xfs_alert, KERN_ALERT); -define_xfs_printk_level(xfs_crit, KERN_CRIT); -define_xfs_printk_level(xfs_err, KERN_ERR); -define_xfs_printk_level(xfs_warn, KERN_WARNING); -define_xfs_printk_level(xfs_notice, KERN_NOTICE); -define_xfs_printk_level(xfs_info, KERN_INFO); -#ifdef DEBUG -define_xfs_printk_level(xfs_debug, KERN_DEBUG); -#endif + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(kern_level, mp, &vaf); + + va_end(args); + + if (!kstrtoint(kern_level, 0, &level) && + level <= LOGLEVEL_ERR && + xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} void xfs_alert_tag( diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index bb9860ec9a93..a281b1cc13d5 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -6,32 +6,33 @@ struct xfs_mount; -extern __printf(2, 3) -void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, + const char *fmt, ...); + +#define xfs_emerg(mp, fmt, ...) \ + xfs_printk_level(KERN_EMERG, mp, fmt, ##__VA_ARGS__) +#define xfs_alert(mp, fmt, ...) \ + xfs_printk_level(KERN_ALERT, mp, fmt, ##__VA_ARGS__) +#define xfs_crit(mp, fmt, ...) \ + xfs_printk_level(KERN_CRIT, mp, fmt, ##__VA_ARGS__) +#define xfs_err(mp, fmt, ...) \ + xfs_printk_level(KERN_ERR, mp, fmt, ##__VA_ARGS__) +#define xfs_warn(mp, fmt, ...) \ + xfs_printk_level(KERN_WARNING, mp, fmt, ##__VA_ARGS__) +#define xfs_notice(mp, fmt, ...) \ + xfs_printk_level(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) +#define xfs_info(mp, fmt, ...) \ + xfs_printk_level(KERN_INFO, mp, fmt, ##__VA_ARGS__) +#ifdef DEBUG +#define xfs_debug(mp, fmt, ...) \ + xfs_printk_level(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) +#else +#define xfs_debug(mp, fmt, ...) do {} while (0) +#endif + extern __printf(3, 4) void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); -extern __printf(2, 3) -void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); -extern __printf(2, 3) -void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); - -#ifdef DEBUG -extern __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); -#else -static inline __printf(2, 3) -void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) -{ -} -#endif #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ From e270356944ccba0a9c23b4d0751d2e029184ad21 Mon Sep 17 00:00:00 2001 From: Jonathan Lassoff Date: Mon, 11 Apr 2022 13:06:39 +1000 Subject: [PATCH 2/7] xfs: Add XFS messages to printk index In order for end users to quickly react to new issues that come up in production, it is proving useful to leverage the printk indexing system. This printk index enables kernel developers to use calls to printk() with changeable format strings (as they always have; no change of expectations), while enabling end users to examine format strings to detect changes. Since end users are using regular expressions to match messages printed through printk(), being able to detect changes in chosen format strings from release to release provides a useful signal to review printk()-matching regular expressions for any necessary updates. So that detailed XFS messages are captures by this printk index, this patch wraps the xfs_ and xfs_alert_tag functions. Signed-off-by: Jonathan Lassoff Reviewed-by: Chris Down Reviewed-by: Petr Mladek Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_message.c | 2 +- fs/xfs/xfs_message.h | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 9ceebd4c9ff1..22c2adff1260 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -52,7 +52,7 @@ xfs_printk_level( } void -xfs_alert_tag( +_xfs_alert_tag( const struct xfs_mount *mp, int panic_tag, const char *fmt, ...) diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index a281b1cc13d5..035ee3d244ac 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -10,29 +10,40 @@ extern __printf(3, 4) void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp, const char *fmt, ...); +#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \ + xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \ +}) #define xfs_emerg(mp, fmt, ...) \ - xfs_printk_level(KERN_EMERG, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__) #define xfs_alert(mp, fmt, ...) \ - xfs_printk_level(KERN_ALERT, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__) #define xfs_crit(mp, fmt, ...) \ - xfs_printk_level(KERN_CRIT, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__) #define xfs_err(mp, fmt, ...) \ - xfs_printk_level(KERN_ERR, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__) #define xfs_warn(mp, fmt, ...) \ - xfs_printk_level(KERN_WARNING, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__) #define xfs_notice(mp, fmt, ...) \ - xfs_printk_level(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__) #define xfs_info(mp, fmt, ...) \ - xfs_printk_level(KERN_INFO, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__) #ifdef DEBUG #define xfs_debug(mp, fmt, ...) \ - xfs_printk_level(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) + xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__) #else #define xfs_debug(mp, fmt, ...) do {} while (0) #endif +#define xfs_alert_tag(mp, tag, fmt, ...) \ +({ \ + printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \ + _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \ +}) + extern __printf(3, 4) -void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +void _xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); #define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ From f3bf67c6c6fe863b7946ac0c2214a147dc50523d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 12 Apr 2022 06:49:40 +1000 Subject: [PATCH 3/7] xfs: Use generic_file_open() Remove the open-coded check of O_LARGEFILE. This changes the errno to be the same as other filesystems; it was changed generically in 2.6.24 but that fix skipped XFS. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5bddb1e9e0b3..c5541d062d0d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1167,12 +1167,10 @@ xfs_file_open( struct inode *inode, struct file *file) { - if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - return 0; + return generic_file_open(inode, file); } STATIC int From f34061f554feba68e12b7a73008c350d2a9afd0c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:41 +1000 Subject: [PATCH 4/7] xfs: pass explicit mount pointer to rtalloc query functions Pass an explicit xfs_mount pointer to the rtalloc query functions so that they can support transactionless queries. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 9 +++++---- fs/xfs/scrub/rtbitmap.c | 9 +++++---- fs/xfs/xfs_fsmap.c | 6 +++--- fs/xfs/xfs_rtalloc.h | 7 ++++--- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 8fa012057405..0a3bde64c675 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -40,6 +40,7 @@ xchk_setup_rt( /* Scrub a free extent record from the realtime bitmap. */ STATIC int xchk_rtbitmap_rec( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -48,10 +49,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_rtblock_t blockcount; - startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; - if (!xfs_verify_rtext(sc->mp, startblock, blockcount)) + if (!xfs_verify_rtext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -114,7 +115,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10e1cb71439e..bb23199f65c3 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -450,11 +450,11 @@ xfs_getfsmap_logdev( /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; @@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; - error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; @@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query( info->last = true; ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); - error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); if (error) goto err; err: diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 91b00289509b..539d134f4f25 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,6 +22,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -123,11 +124,11 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_trans *tp, +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); @@ -140,7 +141,7 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) From 5a605fd6cb1da0ec9cb6e54c06bcf58f706d2f83 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:42 +1000 Subject: [PATCH 5/7] xfs: recalculate free rt extents after log recovery I've been observing periodic corruption reports from xfs_scrub involving the free rt extent counter (frextents) while running xfs/141. That test uses an error injection knob to induce a torn write to the log, and an arbitrary number of recovery mounts, frextents will count fewer free rt extents than can be found the rtbitmap. The root cause of the problem is a combination of the misuse of sb_frextents in the incore mount to reflect both incore reservations made by running transactions as well as the actual count of free rt extents on disk. The following sequence can reproduce the undercount: Thread 1 Thread 2 xfs_trans_alloc(rtextents=3) xfs_mod_frextents(-3) xfs_attr_set() xfs_bmap_attr_addfork() xfs_add_attr2() xfs_log_sb() xfs_sb_to_disk() xfs_trans_commit() Note that thread 1 subtracts 3 from sb_frextents even though it never commits to using that space. Thread 2 writes the undercounted value to the ondisk superblock and logs it to the xattr transaction, which is then flushed to disk. At next mount, log recovery will find the logged superblock and write that back into the filesystem. At the end of log recovery, we reread the superblock and install the recovered undercounted frextents value into the incore superblock. From that point on, we've effectively leaked thread 1's transaction reservation. The correct fix for this is to separate the incore reservation from the ondisk usage, but that's a matter for the next patch. Because the kernel has been logging superblocks with undercounted frextents for a very long time and we don't demand that sysadmins run xfs_repair after a crash, fix the undercount by recomputing frextents after log recovery. Gating this on log recovery is a reasonable balance (I think) between correcting the problem and slowing down every mount attempt. Note that xfs_repair will fix undercounted frextents. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 41 ++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_rtalloc.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_rtalloc.h | 2 ++ 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5f153c3693f..53e130f803b1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -468,6 +468,8 @@ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { + int error = 0; + /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. @@ -506,11 +508,32 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && - !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) - return 0; + if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || + xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { + error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + if (error) + return error; + } - return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); + /* + * Older kernels misused sb_frextents to reflect both incore + * reservations made by running transactions and the actual count of + * free rt extents in the ondisk metadata. Transactions committed + * during runtime can therefore contain a superblock update that + * undercounts the number of free rt extents tracked in the rt bitmap. + * A clean unmount record will have the correct frextents value since + * there can be no other transactions running at that point. + * + * If we're mounting the rt volume after recovering the log, recompute + * frextents from the rtbitmap file to fix the inconsistency. + */ + if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + error = xfs_rtalloc_reinit_frextents(mp); + if (error) + return error; + } + + return 0; } /* @@ -784,11 +807,6 @@ xfs_mountfs( goto out_inodegc_shrinker; } - /* Make sure the summary counts are ok. */ - error = xfs_check_summary_counts(mp); - if (error) - goto out_log_dealloc; - /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); @@ -844,6 +862,11 @@ xfs_mountfs( goto out_rele_rip; } + /* Make sure the summary counts are ok. */ + error = xfs_check_summary_counts(mp); + if (error) + goto out_rtunmount; + /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b8c79ee791af..76f50e75f99c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1284,6 +1284,43 @@ xfs_rtmount_init( return 0; } +static int +xfs_rtalloc_count_frextent( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + uint64_t *valp = priv; + + *valp += rec->ar_extcount; + return 0; +} + +/* + * Reinitialize the number of free realtime extents from the realtime bitmap. + * Callers must ensure that there is no other activity in the filesystem. + */ +int +xfs_rtalloc_reinit_frextents( + struct xfs_mount *mp) +{ + uint64_t val = 0; + int error; + + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, + &val); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_frextents = val; + spin_unlock(&mp->m_sb_lock); + return 0; +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 539d134f4f25..62c7ad79cbb6 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -135,6 +135,7 @@ bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, bool *is_free); +int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) @@ -145,6 +146,7 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, # define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) # define xfs_verify_rtbno(m, r) (false) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ From 2229276c5283264b8c2241c1ed972bbb136cab22 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 12 Apr 2022 06:49:42 +1000 Subject: [PATCH 6/7] xfs: use a separate frextents counter for rt extent reservations As mentioned in the previous commit, the kernel misuses sb_frextents in the incore mount to reflect both incore reservations made by running transactions as well as the actual count of free rt extents on disk. This results in the superblock being written to the log with an underestimate of the number of rt extents that are marked free in the rtbitmap. Teaching XFS to recompute frextents after log recovery avoids operational problems in the current mount, but it doesn't solve the problem of us writing undercounted frextents which are then recovered by an older kernel that doesn't have that fix. Create an incore percpu counter to mirror the ondisk frextents. This new counter will track transaction reservations and the only time we will touch the incore super counter (i.e the one that gets logged) is when those transactions commit updates to the rt bitmap. This is in contrast to the lazysbcount counters (e.g. fdblocks), where we know that log recovery will always fix any incorrect counter that we log. As a bonus, we only take m_sb_lock at transaction commit time. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 5 +++++ fs/xfs/xfs_fsops.c | 5 +---- fs/xfs/xfs_icache.c | 9 +++++--- fs/xfs/xfs_mount.c | 50 ++++++++++++++++++------------------------ fs/xfs/xfs_mount.h | 19 +++++++++++++--- fs/xfs/xfs_rtalloc.c | 1 + fs/xfs/xfs_super.c | 14 ++++++++++-- fs/xfs/xfs_trans.c | 43 +++++++++++++++++++++++++++++++----- 8 files changed, 99 insertions(+), 47 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..8dd7186ef9df 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -911,6 +911,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 68f74549fa22..a0d7aa7fbbff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -349,10 +349,7 @@ xfs_fs_counts( cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - - spin_lock(&mp->m_sb_lock); - cnt->freertx = mp->m_sb.sb_frextents; - spin_unlock(&mp->m_sb_lock); + cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bffd6eb0b298..5269354b1b69 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - uint64_t freertx; if (!XFS_IS_REALTIME_INODE(ip)) return false; - freertx = READ_ONCE(mp->m_sb.sb_frextents); - return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; + if (__percpu_counter_compare(&mp->m_frextents, + mp->m_low_rtexts[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + return false; } #else # define xfs_inodegc_want_queue_rt_file(ip) (false) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 53e130f803b1..0c0bcbd4949d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1110,24 +1110,33 @@ xfs_fs_writable( return true; } +/* Adjust m_fdblocks or m_frextents. */ int -xfs_mod_fdblocks( +xfs_mod_freecounter( struct xfs_mount *mp, + struct percpu_counter *counter, int64_t delta, bool rsvd) { int64_t lcounter; long long res_used; + uint64_t set_aside = 0; s32 batch; - uint64_t set_aside; + bool has_resv_pool; + + ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); + has_resv_pool = (counter == &mp->m_fdblocks); + if (rsvd) + ASSERT(has_resv_pool); if (delta > 0) { /* * If the reserve pool is depleted, put blocks back into it * first. Most of the time the pool is full. */ - if (likely(mp->m_resblks == mp->m_resblks_avail)) { - percpu_counter_add(&mp->m_fdblocks, delta); + if (likely(!has_resv_pool || + mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(counter, delta); return 0; } @@ -1139,7 +1148,7 @@ xfs_mod_fdblocks( } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(&mp->m_fdblocks, delta); + percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); return 0; @@ -1153,7 +1162,7 @@ xfs_mod_fdblocks( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1170,9 +1179,10 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + if (has_resv_pool) + set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(counter, delta, batch); + if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; @@ -1183,8 +1193,8 @@ xfs_mod_fdblocks( * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - percpu_counter_add(&mp->m_fdblocks, -delta); - if (!rsvd) + percpu_counter_add(counter, -delta); + if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail + delta; @@ -1201,24 +1211,6 @@ xfs_mod_fdblocks( return -ENOSPC; } -int -xfs_mod_frextents( - struct xfs_mount *mp, - int64_t delta) -{ - int64_t lcounter; - int ret = 0; - - spin_lock(&mp->m_sb_lock); - lcounter = mp->m_sb.sb_frextents + delta; - if (lcounter < 0) - ret = -ENOSPC; - else - mp->m_sb.sb_frextents = lcounter; - spin_unlock(&mp->m_sb_lock); - return ret; -} - /* * Used to free the superblock along various error paths. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f6dc19de8322..a6b8efb2df52 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,8 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + struct percpu_counter m_frextents; /* free rt extent counter */ + /* * Count of data device blocks reserved for delayed allocations, * including indlen blocks. Does not include allocated CoW staging @@ -494,9 +496,20 @@ xfs_fdblocks_unavailable( return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); } -extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, - bool reserved); -extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); +int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, + int64_t delta, bool rsvd); + +static inline int +xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved) +{ + return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved); +} + +static inline int +xfs_mod_frextents(struct xfs_mount *mp, int64_t delta) +{ + return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false); +} extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 76f50e75f99c..997e4a9d27d3 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1318,6 +1318,7 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54be9d64093e..3a5088646294 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -843,9 +843,11 @@ xfs_fs_statfs( if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { + s64 freertx; + statp->f_blocks = sbp->sb_rblocks; - statp->f_bavail = statp->f_bfree = - sbp->sb_frextents * sbp->sb_rextsize; + freertx = percpu_counter_sum_positive(&mp->m_frextents); + statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; } return 0; @@ -1015,8 +1017,14 @@ xfs_init_percpu_counters( if (error) goto free_fdblocks; + error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); + if (error) + goto free_delalloc; + return 0; +free_delalloc: + percpu_counter_destroy(&mp->m_delalloc_blks); free_fdblocks: percpu_counter_destroy(&mp->m_fdblocks); free_ifree: @@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters( percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); + percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); } static void @@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters( ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); + percpu_counter_destroy(&mp->m_frextents); } static int diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 0ac717aad380..6d9df2e9b267 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -498,10 +498,31 @@ xfs_trans_apply_sb_deltas( be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); - if (tp->t_res_frextents_delta) - be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); + /* + * Updating frextents requires careful handling because it does not + * behave like the lazysb counters because we cannot rely on log + * recovery in older kenels to recompute the value from the rtbitmap. + * This means that the ondisk frextents must be consistent with the + * rtbitmap. + * + * Therefore, log the frextents change to the ondisk superblock and + * update the incore superblock so that future calls to xfs_log_sb + * write the correct value ondisk. + * + * Don't touch m_frextents because it includes incore reservations, + * and those are handled by the unreserve function. + */ + if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + struct xfs_mount *mp = tp->t_mountp; + int64_t rtxdelta; + + rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; + + spin_lock(&mp->m_sb_lock); + be64_add_cpu(&sbp->sb_frextents, rtxdelta); + mp->m_sb.sb_frextents += rtxdelta; + spin_unlock(&mp->m_sb_lock); + } if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); @@ -614,7 +635,12 @@ xfs_trans_unreserve_and_mod_sb( if (ifreedelta) percpu_counter_add(&mp->m_ifree, ifreedelta); - if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + if (rtxdelta) { + error = xfs_mod_frextents(mp, rtxdelta); + ASSERT(!error); + } + + if (!(tp->t_flags & XFS_TRANS_SB_DIRTY)) return; /* apply remaining deltas */ @@ -622,7 +648,12 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; - mp->m_sb.sb_frextents += rtxdelta; + /* + * Do not touch sb_frextents here because we are dealing with incore + * reservation. sb_frextents is not part of the lazy sb counters so it + * must be consistent with the ondisk rtbitmap and must never include + * incore reservations. + */ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; From 2d9ac4319b9959bf3195fedf88bdfd224c67593b Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 21 Apr 2022 08:47:54 +1000 Subject: [PATCH 7/7] xfs: simplify local variable assignment in file write code Get the struct inode pointer from iocb->ki_filp->f_mapping->host directly and the other variables are unnecessary, so simplify the local variables assignment. Signed-off-by: Kaixu Xia Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c5541d062d0d..442bfaed202e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -694,9 +694,7 @@ xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; bool cleared_space = false; @@ -767,9 +765,7 @@ xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from);