From 8d2f9f5c64f16e0717854fb66d795ebe8c30103b Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:14 +0200 Subject: [PATCH 01/53] xfs: allow renames of project-less inodes Special file inodes cannot have project ID set from userspace and are skipped during initial project setup. Those inodes are left project-less in the project directory. New inodes created after project initialization do have an ID. Creating hard links or renaming those project-less inodes then fails on different ID check. In commit e23d7e82b707 ("xfs: allow cross-linking special files without project quota"), we relaxed the project id checks to allow hardlinking special files with differing project ids since the projid cannot be changed. Apply the same workaround for renaming operations. Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 64 ++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..0ddb9ce0f5e3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/renames + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected. - */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -2227,16 +2238,9 @@ xfs_rename( if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { From 8a221004fe5288b66503699a329a6b623be13f91 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:15 +0200 Subject: [PATCH 02/53] xfs: add .fileattr_set and fileattr_get callbacks for symlinks As there are now file_getattr() and file_setattr(), xfs_quota will call them on special files. These new syscalls call ->fileattr_get/set. Symlink inodes don't have callbacks to set file attributes. This patch adds them. The attribute values combinations are checked in fileattr_set_prepare(). Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_iops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee..c0f49bfbed68 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1335,6 +1335,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ From 0239bd9fa445a21def88f7e76fe6e0414b2a4da0 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 4 Aug 2025 14:08:16 +0200 Subject: [PATCH 03/53] xfs: allow setting file attributes on special files XFS does't have file attributes manipulation ioctls for special files. Changing or reading file attributes is rejected for them in xfs_fileattr_*et(). In XFS, this is necessary to work for project quota directories. When project is set up, xfs_quota opens and calls FS_IOC_SETFSXATTR on every inode in the directory. However, special files are skipped due to open() returning a special inode for them. So, they don't even get to this check. The recently added file_getattr/file_setattr will call xfs_fileattr_*et, on special files. This patch allows reading/changing file attributes on special files. Signed-off-by: Andrey Albershteyn Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_ioctl.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e1051a530a50..10a3a5160f4e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -512,9 +512,6 @@ xfs_fileattr_get( { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | From 851c4c96db001f51bdad1432aa54549c7fe2c63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Aug 2025 13:15:00 +0200 Subject: [PATCH 04/53] xfs: implement XFS_IOC_DIOINFO in terms of vfs_getattr Use the direct I/O alignment reporting from ->getattr instead of reimplementing it. This exposes the relaxation of the memory alignment in the XFS_IOC_DIOINFO info and ensure the information will stay in sync. Note that randholes.c in xfstests has a bug where it incorrectly fails when the required memory alignment is smaller than the pointer size. Round up the reported value as there is a fair chance that this code got copied into various applications. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_ioctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 10a3a5160f4e..a6bb7ee7a27a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1203,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. */ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; From 33ddc796ecbd50cd6211aa9e9eddbf4567038b49 Mon Sep 17 00:00:00 2001 From: Marcelo Moreira Date: Wed, 27 Aug 2025 19:17:07 -0300 Subject: [PATCH 05/53] xfs: Replace strncpy with memcpy The changes modernizes the code by aligning it with current kernel best practices. It improves code clarity and consistency, as strncpy is deprecated as explained in Documentation/process/deprecated.rst. This change does not alter the functionality or introduce any behavioral changes. Suggested-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Marcelo Moreira Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/symlink_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc..5902398185a8 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -185,7 +185,7 @@ xrep_symlink_salvage_inline( return 0; nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + memcpy(target_buf, ifp->if_data, nr); return nr; } From cd32a0c0dcdf634f2e0e71f41c272e19dece6264 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:32 -0700 Subject: [PATCH 06/53] xfs: use deferred intent items for reaping crosslinked blocks When we're removing rmap records for crosslinked blocks, use deferred intent items so that we can try to free/unmap as many of the old data structure's blocks as we can in the same transaction as the commit. Cc: # v6.6 Fixes: 1c7ce115e52106 ("xfs: reap large AG metadata extents when possible") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 8703897c0a9c..86d3d104b8d9 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -416,8 +416,6 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* * If we're unmapping CoW staging extents, remove the @@ -426,11 +424,14 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + rs->force_roll = true; return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + rs->deferred++; + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); From 82e374405e85af2ce3149c74766168df14864697 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:30 -0700 Subject: [PATCH 07/53] xfs: prepare reaping code for dynamic limits The online repair block reaping code employs static limits to decide if it's time to roll the transaction or finish the deferred item chains to avoid overflowing the scrub transaction's reservation. However, the use of static limits aren't great -- btree blocks are assumed to be scattered around the AG and the buffers need to be invalidated, whereas COW staging extents are usually contiguous and do not have buffers. We would like to configure the limits dynamically. To get ready for this, reorganize struct xreap_state to store dynamic limits, and add helpers to hide some of the details of how the limits are enforced. Also rename the "xreap roll" functions to include the word "binval" because they only exist to decide when we should roll the transaction to deal with buffer invalidations. No functional changes intended here. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 149 +++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 70 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86d3d104b8d9..16bd298330a4 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -95,17 +95,17 @@ struct xreap_state { const struct xfs_owner_info *oinfo; enum xfs_ag_resv_type resv; - /* If true, roll the transaction before reaping the next extent. */ - bool force_roll; + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int nr_binval; + + /* Maximum number of buffers we can invalidate in a single tx. */ + unsigned int max_binval; /* Number of deferred reaps attached to the current transaction. */ - unsigned int deferred; + unsigned int nr_deferred; - /* Number of invalidated buffers logged to the current transaction. */ - unsigned int invalidated; - - /* Number of deferred reaps queued during the whole reap sequence. */ - unsigned long long total_deferred; + /* Maximum number of intents we can reap in a single transaction. */ + unsigned int max_deferred; }; /* Put a block back on the AGFL. */ @@ -148,44 +148,36 @@ xreap_put_freelist( } /* Are there any uncommitted reap operations? */ -static inline bool xreap_dirty(const struct xreap_state *rs) +static inline bool xreap_is_dirty(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred) - return true; - if (rs->invalidated) - return true; - if (rs->total_deferred) - return true; - return false; + return rs->nr_binval > 0 || rs->nr_deferred > 0; } #define XREAP_MAX_BINVAL (2048) /* - * Decide if we want to roll the transaction after reaping an extent. We don't - * want to overrun the transaction reservation, so we prohibit more than - * 128 EFIs per transaction. For the same reason, we limit the number - * of buffer invalidations to 2048. + * Decide if we need to roll the transaction to clear out the the log + * reservation that we allocated to buffer invalidations. */ -static inline bool xreap_want_roll(const struct xreap_state *rs) +static inline bool xreap_want_binval_roll(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) - return true; - if (rs->invalidated > XREAP_MAX_BINVAL) - return true; - return false; + return rs->nr_binval >= rs->max_binval; } -static inline void xreap_reset(struct xreap_state *rs) +/* Reset the buffer invalidation count after rolling. */ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; +} + +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction. + */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; } #define XREAP_MAX_DEFER_CHAIN (2048) @@ -194,25 +186,36 @@ static inline void xreap_reset(struct xreap_state *rs) * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory. Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. */ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; } /* @@ -297,14 +300,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -424,13 +426,13 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, rs->oinfo->oi_owner); - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -444,7 +446,7 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } @@ -464,7 +466,7 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } @@ -475,7 +477,7 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } @@ -490,8 +492,8 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } @@ -532,11 +534,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -557,6 +559,8 @@ xrep_reap_agblocks( .sc = sc, .oinfo = oinfo, .resv = type, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -567,7 +571,7 @@ xrep_reap_agblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -629,7 +633,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -640,7 +644,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -669,6 +673,8 @@ xrep_reap_fsblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -679,7 +685,7 @@ xrep_reap_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -779,7 +785,7 @@ xreap_rgextent_iter( *rglenp); xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -800,7 +806,7 @@ xreap_rgextent_iter( if (error) return error; - rs->deferred++; + xreap_inc_defer(rs); return 0; } @@ -856,11 +862,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -887,6 +893,8 @@ xrep_reap_rtblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -897,7 +905,7 @@ xrep_reap_rtblocks( if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -923,6 +931,8 @@ xrep_reap_metadir_fsblocks( .sc = sc, .oinfo = &oinfo, .resv = XFS_AG_RESV_NONE, + .max_binval = XREAP_MAX_BINVAL, + .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -931,12 +941,11 @@ xrep_reap_metadir_fsblocks( ASSERT(xfs_is_metadir_inode(sc->ip)); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; From ef930cc371f033d9ce21292ca577741dfeb8c7f3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:31 -0700 Subject: [PATCH 08/53] xfs: convert the ifork reap code to use xreap_state Convert the file fork reaping code to use struct xreap_state so that we can reuse the dynamic state tracking code. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 78 ++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 16bd298330a4..33272729249f 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -91,9 +91,21 @@ struct xreap_state { struct xfs_scrub *sc; - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; + union { + struct { + /* + * For AG blocks, this is reverse mapping owner and + * metadata reservation type. + */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + }; + struct { + /* For file blocks, this is the inode and fork. */ + struct xfs_inode *ip; + int whichfork; + }; + }; /* Number of invalidated buffers logged to the current transaction. */ unsigned int nr_binval; @@ -965,13 +977,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -985,7 +996,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1048,21 +1060,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1089,7 +1099,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1130,14 +1140,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. */ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1159,12 +1168,11 @@ xreap_bmapi_binval( */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1185,10 +1193,10 @@ xrep_reap_bmapi_iter( * deferred log intents in this function to control the exact * sequence of metadata updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1209,7 +1217,7 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; @@ -1218,8 +1226,8 @@ xrep_reap_bmapi_iter( * intents in this function to control the exact sequence of metadata * updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, @@ -1232,18 +1240,17 @@ xrep_reap_bmapi_iter( */ STATIC int xreap_ifork_extent( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; xfs_agnumber_t agno; bool crosslinked; int error; ASSERT(sc->sa.pag == NULL); - trace_xreap_ifork_extent(sc, ip, whichfork, imap); + trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap); agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); sc->sa.pag = xfs_perag_get(sc->mp, agno); @@ -1258,11 +1265,11 @@ xreap_ifork_extent( * Decide the fate of the blocks at the beginning of the mapping, then * update the mapping to use it with the unmap calls. */ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1286,6 +1293,12 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + .max_binval = XREAP_MAX_BINVAL, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1313,13 +1326,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; From b2311ec6778fcde9d40cd9e22899f8bd594d8465 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:33 -0700 Subject: [PATCH 09/53] xfs: compute per-AG extent reap limits dynamically Calculate the maximum number of extents that can be reaped in a single transaction chain, and the number of buffers that can be invalidated in a single transaction. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent + space used per binval) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 140 +++++++++++++++++++++++++++++++++++++++---- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 42 +++++++++++++ 3 files changed, 171 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 33272729249f..929ea3c453d3 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -36,6 +36,12 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_bmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -230,6 +236,15 @@ static inline void xreap_force_defer_finish(struct xreap_state *rs) rs->nr_deferred = rs->max_deferred; } +/* Maximum number of fsblocks that we might find in a buffer to invalidate. */ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); +} + /* * Compute the maximum length of a buffer cache scan (in units of sectors), * given a quantity of fs blocks. @@ -239,12 +254,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. */ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -442,6 +453,7 @@ xreap_agextent_iter( return 0; } + /* t1: unmap crosslinked metadata blocks */ xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, rs->oinfo->oi_owner); xreap_inc_defer(rs); @@ -482,7 +494,7 @@ xreap_agextent_iter( return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); @@ -494,7 +506,7 @@ xreap_agextent_iter( } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -510,6 +522,110 @@ xreap_agextent_iter( return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = res / variable_overhead; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. + * + * t3: Freeing to AGFL: roll and finish deferred items for every block. + * Limits here do not matter. + * + * t4: Freeing metadata blocks: deferred freeing of the space, which + * also removes the rmap record. + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = rui; + const unsigned int t4 = rui + efi; + const unsigned int per_intent = max(t1, t4); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of EFI or + * RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int step_size = max(f1, f2); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Maximum overhead of invalidating one buffer. */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that btree + * blocks aren't usually contiguous; and that scrub likely pulled all + * the buffers into memory. From these assumptions, set the maximum + * number of deferrals we can queue before flushing the defer chain, + * and the number of invalidations we can queue before rolling to a + * clean transaction (and possibly relogging some of the deferrals) to + * the same quantity. + */ + const unsigned int variable_overhead = per_intent + per_binval; + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -571,14 +687,13 @@ xrep_reap_agblocks( .sc = sc, .oinfo = oinfo, .resv = type, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; @@ -693,6 +808,8 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo != &XFS_RMAP_OINFO_COW) + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; @@ -943,8 +1060,6 @@ xrep_reap_metadir_fsblocks( .sc = sc, .oinfo = &oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; @@ -952,6 +1067,7 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103f..987313a52e64 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a8187281eb96..d39da0e67024 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2000,6 +2000,48 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), From 442bc127d460a807858ef9258e77518b9597eed1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:33 -0700 Subject: [PATCH 10/53] xfs: compute data device CoW staging extent reap limits dynamically Calculate the maximum number of CoW staging extents that can be reaped in a single transaction chain. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent + space used for a few buffer invalidations) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 88 +++++++++++++++++++++++++++++++++++++++++--- fs/xfs/scrub/trace.h | 1 + 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 929ea3c453d3..aaef7e6771a0 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -443,7 +443,7 @@ xreap_agextent_iter( if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. */ @@ -475,7 +475,7 @@ xreap_agextent_iter( } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -626,6 +626,84 @@ xreap_configure_agextent_limits( step_size, per_intent, rs->max_deferred); } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_agcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks. From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -800,15 +878,15 @@ xrep_reap_fsblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); - if (oinfo != &XFS_RMAP_OINFO_COW) + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d39da0e67024..a9da22f50534 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2041,6 +2041,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ unsigned int max_deferred), \ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From 74fc66ee17fcd994a49ed80ddba90bc3b7046684 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:34 -0700 Subject: [PATCH 11/53] xfs: compute realtime device CoW staging extent reap limits dynamically Calculate the maximum number of CoW staging extents that can be reaped in a single transaction chain. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used by intents per extent) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 71 +++++++++++++++++++++++++++++++++++++++++--- fs/xfs/scrub/trace.h | 1 + 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index aaef7e6771a0..b2f089e2c49d 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -984,7 +984,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. */ if (crosslinked) { @@ -999,7 +999,7 @@ xreap_rgextent_iter( trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); /* - * The CoW staging extent is not crosslinked. Use deferred work items + * t2: The CoW staging extent is not crosslinked. Use deferred work * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -1017,6 +1017,69 @@ xreap_rgextent_iter( return 0; } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_rgcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * The only buffer for the rt device is the rtgroup super, so we don't + * need to save space for buffer invalidations. + */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -1100,14 +1163,14 @@ xrep_reap_rtblocks( .sc = sc, .oinfo = oinfo, .resv = XFS_AG_RESV_NONE, - .max_binval = XREAP_MAX_BINVAL, - .max_deferred = XREAP_MAX_DEFER_CHAIN, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a9da22f50534..1a994d339c42 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2042,6 +2042,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From e4c7eece7676bac436a8f98b91d646a8b55a67af Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:35 -0700 Subject: [PATCH 12/53] xfs: compute file mapping reap limits dynamically Reaping file fork mappings is a little different -- log recovery can free the blocks for us, so we only try to process a single mapping at a time. Therefore, we only need to figure out the maximum number of blocks that we can invalidate in a single transaction. The rough calculation here is: nr_extents = (logres - reservation used by any one step) / (space used per binval) Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 109 +++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/trace.h | 1 + 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b2f089e2c49d..d58fd57aaebb 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -542,7 +542,7 @@ xreap_configure_limits( return; } - rs->max_deferred = res / variable_overhead; + rs->max_deferred = per_intent ? res / variable_overhead : 0; res -= rs->max_deferred * per_intent; rs->max_binval = per_binval ? res / per_binval : 0; } @@ -1446,7 +1446,7 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. */ @@ -1479,8 +1479,8 @@ xrep_reap_bmapi_iter( return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); @@ -1491,6 +1491,105 @@ xrep_reap_bmapi_iter( XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. */ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. */ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. + */ +STATIC void +xreap_configure_bmapi_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* overhead of invalidating a buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc), + xreap_bmapi_binval_blocksize(sc)); + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int bui = xfs_bui_log_space(1) + + xfs_bud_log_space(); + + /* + * t1: Unmapping crosslinked file data blocks: one bmap deletion, + * possibly an EFI for underfilled bmbt blocks, and an rmap deletion. + * + * t2: Freeing freeing file data blocks: one bmap deletion, possibly an + * EFI for underfilled bmbt blocks, and another EFI for the space + * itself. + */ + const unsigned int t1 = (bui + efi) + rui; + const unsigned int t2 = (bui + efi) + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * Each call to xreap_ifork_extent starts with a clean transaction and + * operates on a single mapping by creating a chain of log intent items + * for that mapping. We need to leave enough reservation in the + * transaction to log btree buffer and inode updates for each step in + * the chain, and to relog the log intents. + */ + const unsigned int per_extent_res = per_intent + step_size; + + xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval); + + trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, 1); +} + /* * Dispose of as much of this file extent as we can. Upon successful return, * the imap will reflect the mapping that was removed from the fork. @@ -1554,7 +1653,6 @@ xrep_reap_ifork( .sc = sc, .ip = ip, .whichfork = whichfork, - .max_binval = XREAP_MAX_BINVAL, }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); @@ -1564,6 +1662,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1a994d339c42..39ea651cbb75 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2043,6 +2043,7 @@ DEFINE_EVENT(xrep_reap_limits_class, name, \ DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, From f69260511c69888bdec048153d0d475d4c8b0b3e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2025 09:30:58 -0700 Subject: [PATCH 13/53] xfs: disable deprecated features by default in Kconfig We promised to turn off these old features by default in September 2025. Do so now. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 5 ++--- fs/xfs/Kconfig | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index a18328a5fb93..693b09ca6292 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -253,9 +253,8 @@ latest version and try again. The deprecation will take place in two parts. Support for mounting V4 filesystems can now be disabled at kernel build time via Kconfig option. -The option will default to yes until September 2025, at which time it -will be changed to default to no. In September 2030, support will be -removed from the codebase entirely. +These options were changed to default to no in September 2025. In +September 2030, support will be removed from the codebase entirely. Note: Distributors may choose to withdraw V4 format support earlier than the dates listed above. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 065953475cf5..ecebd3ebab13 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. From b9a176e54162f890aaf50ac8a467d725ed2f00df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Sep 2025 14:33:53 -0700 Subject: [PATCH 14/53] xfs: remove deprecated mount options These four mount options were scheduled for removal in September 2025, so remove them now. Cc: preichl@redhat.com Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 26 ++------------ fs/xfs/libxfs/xfs_attr_leaf.c | 23 +++--------- fs/xfs/libxfs/xfs_bmap.c | 14 ++------ fs/xfs/libxfs/xfs_ialloc.c | 4 +-- fs/xfs/libxfs/xfs_sb.c | 9 ++--- fs/xfs/xfs_icache.c | 6 ++-- fs/xfs/xfs_mount.c | 13 ------- fs/xfs/xfs_mount.h | 12 +++---- fs/xfs/xfs_super.c | 60 ++----------------------------- 9 files changed, 25 insertions(+), 142 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 693b09ca6292..7ad746a3e66c 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -34,22 +34,6 @@ When mounting an XFS filesystem, the following options are accepted. to the file. Specifying a fixed ``allocsize`` value turns off the dynamic behaviour. - attr2 or noattr2 - The options enable/disable an "opportunistic" improvement to - be made in the way inline extended attributes are stored - on-disk. When the new form is used for the first time when - ``attr2`` is selected (either when setting or removing extended - attributes) the on-disk superblock feature bit field will be - updated to reflect this format being in use. - - The default behaviour is determined by the on-disk feature - bit indicating that ``attr2`` behaviour is active. If either - mount option is set, then that becomes the new default used - by the filesystem. - - CRC enabled filesystems always use the ``attr2`` format, and so - will reject the ``noattr2`` mount option if it is set. - discard or nodiscard (default) Enable/disable the issuing of commands to let the block device reclaim space freed by the filesystem. This is @@ -75,12 +59,6 @@ When mounting an XFS filesystem, the following options are accepted. across the entire filesystem rather than just on directories configured to use it. - ikeep or noikeep (default) - When ``ikeep`` is specified, XFS does not delete empty inode - clusters and keeps them around on disk. When ``noikeep`` is - specified, empty inode clusters are returned to the free - space pool. - inode32 or inode64 (default) When ``inode32`` is specified, it indicates that XFS limits inode creation to locations which will not result in inode @@ -267,8 +245,6 @@ Deprecated Mount Options ============================ ================ Mounting with V4 filesystem September 2030 Mounting ascii-ci filesystem September 2030 -ikeep/noikeep September 2025 -attr2/noattr2 September 2025 ============================ ================ @@ -284,6 +260,8 @@ Removed Mount Options osyncisdsync/osyncisosync v4.0 barrier v4.19 nobarrier v4.19 + ikeep/noikeep v6.18 + attr2/noattr2 v6.18 =========================== ======= sysctls diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0c..47213e6023dd 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. + * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. */ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d954f9b8071f..80bdb537fcf7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 750111634d9f..5fefdd4fe75d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb..cdd16dd805d7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -1524,7 +1522,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1536,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4cf7abe50143..e44040206851 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -646,8 +646,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +654,7 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index dc32c5e34d81..0953f6ae94ab 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1057,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. - */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 97de44c32272..f046d1215b04 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -363,7 +363,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -386,7 +385,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -396,7 +394,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. */ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..77acb3e5a4ec 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -105,8 +105,8 @@ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, - Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, @@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), - fsparam_flag("ikeep", Opt_ikeep), - fsparam_flag("noikeep", Opt_noikeep), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), - fsparam_flag("attr2", Opt_attr2), - fsparam_flag("noattr2", Opt_noattr2), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), @@ -175,13 +171,11 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_FEAT_IKEEP, ",ikeep" }, { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, - { XFS_FEAT_ATTR2, ",attr2" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, @@ -1087,15 +1081,6 @@ xfs_finish_flags( } } - /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - /* * prohibit r/w mounts of read-only filesystems */ @@ -1542,22 +1527,6 @@ xfs_fs_parse_param( return 0; #endif /* Following mount options will be removed in September 2025 */ - case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); - parsing_mp->m_features |= XFS_FEAT_IKEEP; - return 0; - case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); - parsing_mp->m_features &= ~XFS_FEAT_IKEEP; - return 0; - case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); - parsing_mp->m_features |= XFS_FEAT_ATTR2; - return 0; - case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1593,16 +1562,6 @@ xfs_fs_validate_params( return -EINVAL; } - /* - * We have not read the superblock at this point, so only the attr2 - * mount option can set the attr2 feature by this stage. - */ - if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); - return -EINVAL; - } - - if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); @@ -2177,21 +2136,6 @@ xfs_fs_reconfigure( if (error) return error; - /* attr2 -> noattr2 */ - if (xfs_has_noattr2(new_mp)) { - if (xfs_has_crc(mp)) { - xfs_warn(mp, - "attr2 is always enabled for a V5 filesystem - can't be changed."); - return -EINVAL; - } - mp->m_features &= ~XFS_FEAT_ATTR2; - mp->m_features |= XFS_FEAT_NOATTR2; - } else if (xfs_has_attr2(new_mp)) { - /* noattr2 -> attr2 */ - mp->m_features &= ~XFS_FEAT_NOATTR2; - mp->m_features |= XFS_FEAT_ATTR2; - } - /* Validate new max_atomic_write option before making other changes */ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { error = xfs_set_max_atomic_write_opt(mp, From d5b157e088c9fb72e2f245fd5cd43a373c4de677 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 16:14:36 -0700 Subject: [PATCH 15/53] xfs: remove static reap limits from repair.h Delete XREAP_MAX_BINVAL and XREAP_MAX_DEFER_CHAIN because the reap code now calculates those limits dynamically, so they're no longer needed. Move the third limit (XREP_MAX_ITRUNCATE_EFIS) to the one file that uses it. Note that the btree rebuilding code should reserve exactly the number of blocks needed to rebuild a btree, so it is rare that the newbt code will need to add any EFIs to the commit transaction. That's why that static limit remains. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 9 +++++++++ fs/xfs/scrub/reap.c | 4 ---- fs/xfs/scrub/repair.h | 8 -------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 1588ce971cb8..951ae8b71566 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -27,6 +27,15 @@ #include "scrub/repair.h" #include "scrub/newbt.h" +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. The newbt code should reserve + * exactly the correct number of blocks to rebuild the btree, so there should + * not be any excess blocks to free when committing a new btree. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + /* * Estimate proper slack values for a btree that's being reloaded. * diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index d58fd57aaebb..82910188111d 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -171,8 +171,6 @@ static inline bool xreap_is_dirty(const struct xreap_state *rs) return rs->nr_binval > 0 || rs->nr_deferred > 0; } -#define XREAP_MAX_BINVAL (2048) - /* * Decide if we need to roll the transaction to clear out the the log * reservation that we allocated to buffer invalidations. @@ -198,8 +196,6 @@ static inline bool xreap_inc_binval(struct xreap_state *rs) return rs->nr_binval < rs->max_binval; } -#define XREAP_MAX_DEFER_CHAIN (2048) - /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9c04295742c8..2bb125c4f9bf 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. - */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); From 21d59d00221e4ecbcb597eec0021c667477d3335 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Sep 2025 14:38:27 -0700 Subject: [PATCH 16/53] xfs: remove deprecated sysctl knobs These sysctl knobs were scheduled for removal in September 2025. That time has come, so remove them. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 26 ++++---------------------- fs/xfs/libxfs/xfs_inode_util.c | 11 ----------- fs/xfs/xfs_globals.c | 2 -- fs/xfs/xfs_iops.c | 12 +++++------- fs/xfs/xfs_linux.h | 2 -- fs/xfs/xfs_sysctl.c | 29 +---------------------------- fs/xfs/xfs_sysctl.h | 3 --- 7 files changed, 10 insertions(+), 75 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 7ad746a3e66c..d6f531f2c0e6 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -289,9 +289,6 @@ The following sysctls are available for the XFS filesystem: removes unused preallocation from clean inodes and releases the unused space back to the free pool. - fs.xfs.speculative_cow_prealloc_lifetime - This is an alias for speculative_prealloc_lifetime. - fs.xfs.error_level (Min: 0 Default: 3 Max: 11) A volume knob for error reporting when internal errors occur. This will generate detailed messages & backtraces for filesystem @@ -318,17 +315,6 @@ The following sysctls are available for the XFS filesystem: This option is intended for debugging only. - fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1) - Controls whether symlinks are created with mode 0777 (default) - or whether their mode is affected by the umask (irix mode). - - fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1) - Controls files created in SGID directories. - If the group ID of the new file does not match the effective group - ID or one of the supplementary group IDs of the parent dir, the - ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl - is set. - fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) Setting this to "1" will cause the "sync" flag set by the **xfs_io(8)** chattr command on a directory to be @@ -364,14 +350,7 @@ The following sysctls are available for the XFS filesystem: Deprecated Sysctls ================== -=========================================== ================ - Name Removal Schedule -=========================================== ================ -fs.xfs.irix_sgid_inherit September 2025 -fs.xfs.irix_symlink_mode September 2025 -fs.xfs.speculative_cow_prealloc_lifetime September 2025 -=========================================== ================ - +None currently. Removed Sysctls =============== @@ -381,6 +360,9 @@ Removed Sysctls ============================= ======= fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 + fs.xfs.irix_symlink_mode v6.18 + fs.xfs.irix_sgid_inherit v6.18 + fs.xfs.speculative_cow_prealloc_lifetime v6.18 ============================= ======= Error handling diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050..309ce6dd5553 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - ip->i_projid = xfs_get_initial_prid(pip); } diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f6f628c01feb..566fd663c95b 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 603effabe1ee..afd041e28bb2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21..4dd747bdbcca 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 751dc74a3067..9918f14b4874 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC int +static inline int xfs_deprecated_dointvec_minmax( const struct ctl_table *ctl, int write, @@ -67,24 +67,6 @@ xfs_deprecated_dointvec_minmax( } static const struct ctl_table xfs_table[] = { - { - .procname = "irix_sgid_inherit", - .data = &xfs_params.sgid_inherit.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.sgid_inherit.min, - .extra2 = &xfs_params.sgid_inherit.max - }, - { - .procname = "irix_symlink_mode", - .data = &xfs_params.symlink_mode.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.symlink_mode.min, - .extra2 = &xfs_params.symlink_mode.max - }, { .procname = "panic_mask", .data = &xfs_params.panic_mask.val, @@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.blockgc_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.blockgc_timer.min, - .extra2 = &xfs_params.blockgc_timer.max, - }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 51646f066c4f..ed9d896079c1 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is - * not a member of parent dir GID. */ - xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ From 07c34f8cef69cb8eeef69c18d6cf0c04fbee3cb3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Apr 2025 18:04:57 -0700 Subject: [PATCH 17/53] xfs: use deferred reaping for data device cow extents Don't roll the whole transaction after every extent, that's rather inefficient. Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/reap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 82910188111d..07f5bb8a6421 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -445,7 +445,7 @@ xreap_agextent_iter( */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); - xreap_force_defer_finish(rs); + xreap_inc_defer(rs); return 0; } @@ -486,7 +486,7 @@ xreap_agextent_iter( if (error) return error; - xreap_force_defer_finish(rs); + xreap_inc_defer(rs); return 0; } From 0ff51a1fd786f47ba435ede6209046959bad54a8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2025 09:31:59 -0700 Subject: [PATCH 18/53] xfs: enable online fsck by default in Kconfig Online fsck has been a part of upstream for over a year now without any serious problems. Turn it on by default in time for the 2025 LTS kernel, and get rid of the "say N if unsure" messages for the default Y options. Signed-off-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino --- fs/xfs/Kconfig | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ecebd3ebab13..8930d5254e1d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y @@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG From e3df98d30369d7859a855bd3e500ea46205640ca Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 9 Sep 2025 07:04:31 +0700 Subject: [PATCH 19/53] xfs: extend removed sysctls table Commit 21d59d00221e4e ("xfs: remove deprecated sysctl knobs") moves recently-removed sysctls to the removed sysctls table but fails to extend the table, hence triggering Sphinx warning: Documentation/admin-guide/xfs.rst:365: ERROR: Malformed table. Text in column margin in table line 8. ============================= ======= Name Removed ============================= ======= fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 fs.xfs.irix_symlink_mode v6.18 fs.xfs.irix_sgid_inherit v6.18 fs.xfs.speculative_cow_prealloc_lifetime v6.18 ============================= ======= [docutils] Extend "Name" column of the table to fit the now-longest sysctl, which is fs.xfs.speculative_cow_prealloc_lifetime. Fixes: 21d59d00221e ("xfs: remove deprecated sysctl knobs") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250908180406.32124fb7@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Reviewed-by: Darrick J. Wong Reviewed-by: Randy Dunlap Signed-off-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index d6f531f2c0e6..c85cd327af28 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -355,15 +355,15 @@ None currently. Removed Sysctls =============== -============================= ======= - Name Removed -============================= ======= - fs.xfs.xfsbufd_centisec v4.0 - fs.xfs.age_buffer_centisecs v4.0 - fs.xfs.irix_symlink_mode v6.18 - fs.xfs.irix_sgid_inherit v6.18 - fs.xfs.speculative_cow_prealloc_lifetime v6.18 -============================= ======= +========================================== ======= + Name Removed +========================================== ======= + fs.xfs.xfsbufd_centisec v4.0 + fs.xfs.age_buffer_centisecs v4.0 + fs.xfs.irix_symlink_mode v6.18 + fs.xfs.irix_sgid_inherit v6.18 + fs.xfs.speculative_cow_prealloc_lifetime v6.18 +========================================== ======= Error handling ============== From eff8668607888988cad7b31528ff08d8883c5d7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:51 -0700 Subject: [PATCH 20/53] xfs: remove the xlog_op_header_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 5 ++--- fs/xfs/xfs_log.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 0d637c276db0..367dfdece9be 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -141,14 +141,13 @@ struct xfs_unmount_log_format { #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -typedef struct xlog_op_header { +struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; +}; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3..94e06c22be67 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2656,10 +2656,11 @@ xlog_state_get_iclog_space( * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. */ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -3153,11 +3154,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); unit_bytes += sizeof(xfs_trans_header_t); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3180,12 +3181,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3322,7 +3323,7 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; + struct xlog_op_header *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; void *base_ptr, *ptr, *p; @@ -3400,7 +3401,7 @@ xlog_verify_iclog( op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif From 05f17dcbfd5dbe309af310508d8830ac4e0c5d4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:52 -0700 Subject: [PATCH 21/53] xfs: remove the xfs_trans_header_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_log.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 367dfdece9be..2c3c5e67f78a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,12 +212,12 @@ typedef struct xfs_log_iovec { * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). */ -typedef struct xfs_trans_header { +struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; +}; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 95de23095030..9e712e62369c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -111,7 +111,7 @@ struct xlog_recover_item { struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 94e06c22be67..7c590ecdf865 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3155,7 +3155,7 @@ xlog_calc_unit_res( /* for trans header */ unit_bytes += sizeof(struct xlog_op_header); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ unit_bytes += sizeof(struct xlog_op_header); From 476688c8ac60da9bfcb3ce7f5a2d30a145ef7f76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:53 -0700 Subject: [PATCH 22/53] xfs: remove the xfs_extent_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Also fix up the comment about the struct xfs_extent definition to be correct and read more easily. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2c3c5e67f78a..6d0cad455a8f 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -605,16 +605,17 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) /* * EFI/EFD log format definitions */ -typedef struct xfs_extent { +struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; -} xfs_extent_t; +}; /* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. + * Since the structures in struct xfs_extent add up to 96 bytes, it has + * different alignments on i386 vs all other architectures, because i386 + * does not pad structures to their natural alignment. + * + * Provide the different variants for use by a conversion routine. */ typedef struct xfs_extent_32 { uint64_t ext_start; @@ -637,7 +638,7 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ + struct xfs_extent efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; static inline size_t @@ -690,7 +691,7 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ + struct xfs_extent efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; static inline size_t From 7eaf684bc48923b5584fc119e8c477be2cdb3eb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:54 -0700 Subject: [PATCH 23/53] xfs: remove the xfs_extent32_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 6d0cad455a8f..f11ba20a164e 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -617,10 +617,10 @@ struct xfs_extent { * * Provide the different variants for use by a conversion routine. */ -typedef struct xfs_extent_32 { +struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; +} __attribute__((packed)); typedef struct xfs_extent_64 { uint64_t ext_start; @@ -654,7 +654,7 @@ typedef struct xfs_efi_log_format_32 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; static inline size_t @@ -707,7 +707,7 @@ typedef struct xfs_efd_log_format_32 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; static inline size_t From 72628b6f459ea4fed3003db8161b52ee746442d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:55 -0700 Subject: [PATCH 24/53] xfs: remove the xfs_extent64_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f11ba20a164e..2b270912e538 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -622,11 +622,11 @@ struct xfs_extent_32 { uint32_t ext_len; } __attribute__((packed)); -typedef struct xfs_extent_64 { +struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; -} xfs_extent_64_t; +}; /* * This is the structure used to lay out an efi log item in the @@ -670,7 +670,7 @@ typedef struct xfs_efi_log_format_64 { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; static inline size_t @@ -723,7 +723,7 @@ typedef struct xfs_efd_log_format_64 { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; static inline size_t From 655d9ec7bd9e38735ae36dbc635a9161a046f7b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:56 -0700 Subject: [PATCH 25/53] xfs: remove the xfs_efi_log_format_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2b270912e538..81c84c8a66fd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -633,13 +633,13 @@ struct xfs_extent_64 { * log. The efi_extents field is a variable size array whose * size is given by efi_nextents. */ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; +}; static inline size_t xfs_efi_log_format_sizeof( diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index c8402040410b..e56376853f2d 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t From 68c9f8444ae930343a2c900cb909825bc8f7304a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:57 -0700 Subject: [PATCH 26/53] xfs: remove the xfs_efi_log_format_32_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 81c84c8a66fd..75cc8b9be598 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -649,13 +649,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent_32 efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ee598a9827..0190cc55d75b 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -202,7 +202,7 @@ xfs_efi_copy_format( sizeof(struct xfs_extent)); return 0; } else if (buf->iov_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; From 3fe5abc2bf4db88c7c9c99e8a1f5b3d1336d528f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:58 -0700 Subject: [PATCH 27/53] xfs: remove the xfs_efi_log_format_64_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 75cc8b9be598..358f7cb43be2 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -665,13 +665,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ struct xfs_extent_64 efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; +}; static inline size_t xfs_efi_log_format64_sizeof( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0190cc55d75b..418ddab590e0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -216,7 +216,7 @@ xfs_efi_copy_format( } return 0; } else if (buf->iov_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; From 0a33d5ad8a46d1f63174d2684b1d743bd6090554 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:26:59 -0700 Subject: [PATCH 28/53] xfs: remove the xfs_efd_log_format_t typedef There are almost no users of the typedef left, kill it and switch the remaining users to use the underlying struct. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/xfs_extfree_item.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 358f7cb43be2..cb63bb156db1 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -686,13 +686,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; +}; static inline size_t xfs_efd_log_format_sizeof( diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index e56376853f2d..af1b0331f7af 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t From a0cb349672f9ac2dcd80afa3dd25e2df2842db7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:00 -0700 Subject: [PATCH 29/53] xfs: remove the unused xfs_efd_log_format_32_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index cb63bb156db1..2155cc6b2aae 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -702,13 +702,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent_32 efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( From 3dde08b64c98cf76b2e2378ecf36351464e2972a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:01 -0700 Subject: [PATCH 30/53] xfs: remove the unused xfs_efd_log_format_64_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2155cc6b2aae..aa8e3b557733 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -718,13 +718,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ struct xfs_extent_64 efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; +}; static inline size_t xfs_efd_log_format64_sizeof( From 1b5c7cc8f8c54858f69311290d5ade12627ff233 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:02 -0700 Subject: [PATCH 31/53] xfs: remove the unused xfs_buf_log_format_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index aa8e3b557733..631af2e28c56 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -541,7 +541,7 @@ struct xfs_log_dinode { #define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) #define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) -typedef struct xfs_buf_log_format { +struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ @@ -549,7 +549,7 @@ typedef struct xfs_buf_log_format { int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; +}; /* * All buffers now need to tell recovery where the magic number From ae1ef3272b31e6bccd9f2014e8e8c41887a5137b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:03 -0700 Subject: [PATCH 32/53] xfs: remove the unused xfs_dq_logformat_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 631af2e28c56..fff3a2aaeefd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -957,14 +957,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. */ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. From bf0013f59ccdb283083f0451f6edc50ff98e68c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:04 -0700 Subject: [PATCH 33/53] xfs: remove the unused xfs_qoff_logformat_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fff3a2aaeefd..49c4a33166a6 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -974,12 +974,12 @@ struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. From 3e5bdfe48e1f159de7ca3b23a6afa6c10f2a9ad2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:27:05 -0700 Subject: [PATCH 34/53] xfs: remove the unused xfs_log_iovec_t typedef Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 49c4a33166a6..a42a83211724 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -194,12 +194,11 @@ typedef union xlog_in_core2 { } xlog_in_core_2_t; /* not an on-disk structure, but needed by log recovery in userspace */ -typedef struct xfs_log_iovec { +struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ -} xfs_log_iovec_t; - +}; /* * Transaction Header definitions. From 0b737f4ac1d3ec093347241df74bbf5f54a7e16c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:20:29 -0700 Subject: [PATCH 35/53] xfs: rename the old_crc variable in xlog_recover_process old_crc is a very misleading name. Rename it to expected_crc as that described the usage much better. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_log_recover.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e6ed9e09c027..0a4db8efd903 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,20 +2894,19 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2917,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } From e747883c7d7306acb4d683038d881528fbfbe749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:20:30 -0700 Subject: [PATCH 36/53] xfs: fix log CRC mismatches between i386 and other architectures When mounting file systems with a log that was dirtied on i386 on other architectures or vice versa, log recovery is unhappy: [ 11.068052] XFS (vdb): Torn write (CRC failure) detected at log block 0x2. Truncating head block from 0xc. This is because the CRCs generated by i386 and other architectures always diff. The reason for that is that sizeof(struct xlog_rec_header) returns different values for i386 vs the rest (324 vs 328), because the struct is not sizeof(uint64_t) aligned, and i386 has odd struct size alignment rules. This issue goes back to commit 13cdc853c519 ("Add log versioning, and new super block field for the log stripe") in the xfs-import tree, which adds log v2 support and the h_size field that causes the unaligned size. At that time it only mattered for the crude debug only log header checksum, but with commit 0e446be44806 ("xfs: add CRC checks to the log") it became a real issue for v5 file system, because now there is a proper CRC, and regular builds actually expect it match. Fix this by allowing checksums with and without the padding. Fixes: 0e446be44806 ("xfs: add CRC checks to the log") Cc: # v3.8 Signed-off-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 30 +++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_ondisk.h | 2 ++ fs/xfs/xfs_log.c | 8 ++++---- fs/xfs/xfs_log_priv.h | 4 ++-- fs/xfs/xfs_log_recover.c | 19 +++++++++++++++++-- 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a42a83211724..fd00b77af32b 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -173,12 +173,40 @@ typedef struct xlog_rec_header { __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ + + /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ + + /* fields added for log v2: */ __be32 h_size; /* iclog size : 4 */ + + /* + * When h_size added for log v2 support, it caused structure to have + * a different size on i386 vs all other architectures because the + * sum of the size ofthe member is not aligned by that of the largest + * __be64-sized member, and i386 has really odd struct alignment rules. + * + * Due to the way the log headers are placed out on-disk that alone is + * not a problem becaue the xlog_rec_header always sits alone in a + * BBSIZEs area, and the rest of that area is padded with zeroes. + * But xlog_cksum used to calculate the checksum based on the structure + * size, and thus gives different checksums for i386 vs the rest. + * We now do two checksum validation passes for both sizes to allow + * moving v5 file systems with unclean logs between i386 and other + * (little-endian) architectures. + */ + __u32 h_pad0; } xlog_rec_header_t; +#ifdef __i386__ +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) +#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header) +#else +#define XLOG_REC_SIZE sizeof(struct xlog_rec_header) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) +#endif /* __i386__ */ + typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf7491..7bfa3242e2c5 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7c590ecdf865..2978de9da38e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1568,13 +1568,13 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ @@ -1818,7 +1818,7 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index a9a7a271c15b..0cfc654d8e87 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -499,8 +499,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0a4db8efd903..549d60959aee 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2894,9 +2894,24 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 expected_crc = rhead->h_crc, crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); + + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the glory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return From 94deac977fbd0246c971b4f1d17a6385f5e0b1a4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:04 +0000 Subject: [PATCH 37/53] fs: add an enum for number of life time hints Add WRITE_LIFE_HINT_NR into the rw_hint enum to define the number of values write life time hints can be set to. This is useful for e.g. file systems which may want to map these values to allocation groups. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- include/linux/rw_hint.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb..adcc43042c90 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ From 0301dae732a5402a68fdb8d8461b97da6b9bccc6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:05 +0000 Subject: [PATCH 38/53] xfs: refactor hint based zone allocation Replace the co-location code with a matrix that makes it more clear on how the decisions are made. The matrix contains scores for zone/file hint combinations. A "GOOD" score for an open zone will result in immediate co-location while "OK" combinations will only be picked if we cannot open a new zone. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 124 ++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f28214c28ab5..ff24769b8870 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -493,64 +493,64 @@ xfs_try_open_zone( return oz; } -/* - * For data with short or medium lifetime, try to colocated it into an - * already open zone with a matching temperature. - */ -static bool -xfs_colocate_eagerly( - enum rw_hint file_hint) -{ - switch (file_hint) { - case WRITE_LIFE_MEDIUM: - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - return true; - default: - return false; - } -} +enum xfs_zone_alloc_score { + /* Any open zone will do it, we're desperate */ + XFS_ZONE_ALLOC_ANY = 0, -static bool -xfs_good_hint_match( - struct xfs_open_zone *oz, - enum rw_hint file_hint) -{ - switch (oz->oz_write_hint) { - case WRITE_LIFE_LONG: - case WRITE_LIFE_EXTREME: - /* colocate long and extreme */ - if (file_hint == WRITE_LIFE_LONG || - file_hint == WRITE_LIFE_EXTREME) - return true; - break; - case WRITE_LIFE_MEDIUM: - /* colocate medium with medium */ - if (file_hint == WRITE_LIFE_MEDIUM) - return true; - break; - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - case WRITE_LIFE_NOT_SET: - /* colocate short and none */ - if (file_hint <= WRITE_LIFE_SHORT) - return true; - break; - } - return false; -} + /* It better fit somehow */ + XFS_ZONE_ALLOC_OK = 1, + + /* Only reuse a zone if it fits really well. */ + XFS_ZONE_ALLOC_GOOD = 2, +}; + +/* + * Life time hint co-location matrix. Fields not set default to 0 + * aka XFS_ZONE_ALLOC_ANY. + */ +static const unsigned int +xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { + [WRITE_LIFE_NOT_SET] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_NONE] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_SHORT] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_MEDIUM] = { + [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_LONG] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_EXTREME] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, +}; static bool xfs_try_use_zone( struct xfs_zone_info *zi, enum rw_hint file_hint, struct xfs_open_zone *oz, - bool lowspace) + unsigned int goodness) { if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; - if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + + if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness) return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) return false; @@ -581,14 +581,14 @@ static struct xfs_open_zone * xfs_select_open_zone_lru( struct xfs_zone_info *zi, enum rw_hint file_hint, - bool lowspace) + unsigned int goodness) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + if (xfs_try_use_zone(zi, file_hint, oz, goodness)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -651,9 +651,11 @@ xfs_select_zone_nowait( * data. */ spin_lock(&zi->zi_open_zones_lock); - if (xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - else if (pack_tight) + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD); + if (oz) + goto out_unlock; + + if (pack_tight) oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; @@ -667,16 +669,16 @@ xfs_select_zone_nowait( goto out_unlock; /* - * Try to colocate cold data with other cold data if we failed to open a - * new zone for it. + * Try to find an zone that is an ok match to colocate data with. */ - if (write_hint != WRITE_LIFE_NOT_SET && - !xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); + if (oz) + goto out_unlock; + + /* + * Pick the least recently used zone, regardless of hint match + */ + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; From 8e2cdd8e18ff5073ad76ab2220910001eae39398 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:05 +0000 Subject: [PATCH 39/53] xfs: adjust the hint based zone allocation policy As we really can't make any general assumptions about files that don't have any life time hint set or are set to "NONE", adjust the allocation policy to avoid co-locating data from those files with files with a set life time. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index ff24769b8870..23a027387933 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -512,17 +512,11 @@ static const unsigned int xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { [WRITE_LIFE_NOT_SET] = { [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_OK, }, [WRITE_LIFE_NONE] = { - [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, - [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, }, [WRITE_LIFE_SHORT] = { - [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_GOOD, - [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_GOOD, [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, }, [WRITE_LIFE_MEDIUM] = { From 42c21838708c20dd8ba605e4099bf6a7156c3362 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Sep 2025 06:24:13 -0700 Subject: [PATCH 40/53] xfs: move the XLOG_REG_ constants out of xfs_log_format.h These are purely in-memory values and not used at all in xfsprogs. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_log_format.h | 37 ---------------------------------- fs/xfs/xfs_log.h | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fd00b77af32b..6c50cb2ece19 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -86,43 +86,6 @@ struct xfs_unmount_log_format { uint32_t pad2; /* may as well make it 64 bits */ }; -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_RUI_FORMAT 21 -#define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_CUI_FORMAT 23 -#define XLOG_REG_TYPE_CUD_FORMAT 24 -#define XLOG_REG_TYPE_BUI_FORMAT 25 -#define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_ATTRI_FORMAT 27 -#define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 -#define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_XMI_FORMAT 31 -#define XLOG_REG_TYPE_XMD_FORMAT 32 -#define XLOG_REG_TYPE_ATTR_NEWNAME 33 -#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 -#define XLOG_REG_TYPE_MAX 34 - /* * Flags to log operation header * diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index af6daf4f6792..dcc1f44ed68f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -20,6 +20,43 @@ struct xfs_log_vec { int lv_alloc_size; /* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* From d5409ebf46bb0735ba26c64d7a9c80dde3f94eb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:33 -0700 Subject: [PATCH 41/53] xfs: remove xfs_errortag_get xfs_errortag_get is only called by xfs_errortag_attr_show, which does not need to validate the error tag, because it can only be called on valid error tags that had a sysfs attribute registered. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 16 ++-------------- fs/xfs/xfs_error.h | 1 - 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e137694..45a43e47ce92 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -118,10 +118,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -326,17 +325,6 @@ xfs_errortag_test( return true; } -int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - int xfs_errortag_set( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a598..3aeb03001acf 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -58,7 +58,6 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, unsigned int tag_value); extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); From 991dcadaddcced38c5d2da656862d94a1fc9e6e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:34 -0700 Subject: [PATCH 42/53] xfs: remove xfs_errortag_set xfs_errortag_set is only called by xfs_errortag_attr_store, , which does not need to validate the error tag, because it can only be called on valid error tags that had a sysfs attribute registered. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 29 ++++++----------------------- fs/xfs/xfs_error.h | 3 --- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 45a43e47ce92..fac35ff3da65 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -93,21 +93,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -325,19 +322,6 @@ xfs_errortag_test( return true; } -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - int xfs_errortag_add( struct xfs_mount *mp, @@ -347,9 +331,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3aeb03001acf..fd60a008f9d2 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -58,8 +58,6 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_clearall(struct xfs_mount *mp); #else @@ -67,7 +65,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_del(mp) #define XFS_TEST_ERROR(expr, mp, tag) (expr) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ From 807df3227d7674d7957c576551d552acf15bb96f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:35 -0700 Subject: [PATCH 43/53] xfs: remove the expr argument to XFS_TEST_ERROR Don't pass expr to XFS_TEST_ERROR. Most calls pass a constant false, and the places that do pass an expression become cleaner by moving it out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_ag_resv.c | 7 +++---- fs/xfs/libxfs/xfs_alloc.c | 5 ++--- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 17 ++++++++--------- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_da_btree.c | 2 +- fs/xfs/libxfs/xfs_dir2.c | 2 +- fs/xfs/libxfs/xfs_exchmaps.c | 4 ++-- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_inode_buf.c | 4 ++-- fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- fs/xfs/libxfs/xfs_metafile.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 7 +++---- fs/xfs/libxfs/xfs_rmap.c | 2 +- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/scrub/cow_repair.c | 4 ++-- fs/xfs/scrub/repair.c | 2 +- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- fs/xfs/xfs_error.c | 5 ++--- fs/xfs/xfs_error.h | 10 +++++----- fs/xfs/xfs_inode.c | 28 +++++++++++++--------------- fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_log.c | 8 ++++---- fs/xfs/xfs_trans_ail.c | 2 +- 25 files changed, 62 insertions(+), 70 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d..8ac8230c3d3c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 000cc7f4a3ce..ad381c73abc4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -4019,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 47213e6023dd..91c1b30ebaab 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1212,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 80bdb537fcf7..53ef4b7e504d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3654,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3841,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4192,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4537,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5671,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5787,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5892,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6057,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a61211d253f1..dbe9df8c3300 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 723a0643b838..90f7fc219fcc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d..82a338458a51 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c118..932ee4619e9e 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. */ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 5fefdd4fe75d..d97295eaebe6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd7..b1812b2c3cce 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add55..1772d82f2d68 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c4..b02e3d6c0868 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 897784037483..2484dc9f6d7e 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. */ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fec..83e0488ff773 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586c..618061d898d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 38a246b8bf11..b2a83801412e 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,7 +300,7 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) @@ -385,7 +385,7 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d00c18954a26..efd5a7ccdf62 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5eef3bc30bda..c3a593319bee 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -491,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f9ef3b2a332a..8360e77b3215 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1299,7 +1299,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -2084,7 +2084,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index fac35ff3da65..44dd8aba0097 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -291,7 +291,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -317,8 +316,8 @@ xfs_errortag_test( return false; xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index fd60a008f9d2..8429c1ee86e7 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -41,10 +41,10 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); #define XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -63,7 +63,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0ddb9ce0f5e3..2bfe87b09c2a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2381,8 +2381,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. */ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2398,29 +2398,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2429,8 +2427,8 @@ xfs_iflush( ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2a74f2957341..2570d0a66047 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1554,7 +1554,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1728,7 +1728,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2978de9da38e..6397a45b293d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1827,7 +1827,7 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 67c328d23e4a..38983c6777df 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -374,7 +374,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* From b55dd72798115015908f4a17a1f8d70e8e974ab4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:36 -0700 Subject: [PATCH 44/53] xfs: remove pointless externs in xfs_error.h Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.h | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 8429c1ee86e7..fe6a71bbe9cd 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,8 +34,8 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, unsigned int error_tag); #define XFS_TEST_ERROR(mp, tag) \ @@ -58,8 +53,8 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) From 71fa062196ae3abab790c91f1bdf09dcdc6fb1fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Sep 2025 07:34:37 -0700 Subject: [PATCH 45/53] xfs: centralize error tag definitions Right now 5 places in the kernel and one in xfsprogs need to be updated for each new error tag. Add a bit of macro magic so that only the error tag definition and a single table, which reside next to each other, need to be updated. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_errortag.h | 114 ++++++++++++++---------- fs/xfs/xfs_error.c | 166 +++++------------------------------ 2 files changed, 87 insertions(+), 193 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084..de840abc0bcd 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. */ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + " an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ +/* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. + */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -71,49 +79,61 @@ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. */ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of errror injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1. The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob. + */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 44dd8aba0097..ac895cd2bc0a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ + [XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -125,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); -XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); -XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, From 8e1cfa51320da0cf599d286c89db043f329ca6b0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Sep 2025 22:01:10 +0900 Subject: [PATCH 46/53] xfs: improve zone statistics message Reword the information message displayed in xfs_mount_zones() indicating the total zone count and maximum number of open zones. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 23a027387933..f152b2182004 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1244,7 +1244,7 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks size (%u max open)", + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); From ff3d90903f8f525eedb26efe6fea03c39476cb69 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Sep 2025 22:01:11 +0900 Subject: [PATCH 47/53] xfs: improve default maximum number of open zones For regular block devices using the zoned allocator, the default maximum number of open zones is set to 1/4 of the number of realtime groups. For a large capacity device, this leads to a very large limit. E.g. with a 26 TB HDD: mount /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks size (23959 max open) In turn such large limit on the number of open zones can lead, depending on the workload, on a very large number of concurrent write streams which devices generally do not handle well, leading to poor performance. Introduce the default limit XFS_DEFAULT_MAX_OPEN_ZONES, defined as 128 to match the hardware limit of most SMR HDDs available today, and use this limit to set mp->m_max_open_zones in xfs_calc_open_zones() instead of calling xfs_max_open_zones(), when the user did not specify a limit with the max_open_zones mount option. For the 26 TB HDD example, we now get: mount /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks (128 max open zones) This change does not prevent the user from specifying a lareger number for the open zones limit. E.g. mount -o max_open_zones=4096 /dev/sdb /mnt ... XFS (sdb): 95836 zones of 65536 blocks (4096 max open zones) Finally, since xfs_calc_open_zones() checks and caps the mp->m_max_open_zones limit against the value calculated by xfs_max_open_zones() for any type of device, this new default limit does not increase m_max_open_zones for small capacity devices. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_zones.h | 7 +++++++ fs/xfs/xfs_zone_alloc.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca..5fefd132e002 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on the number of open zones. + */ +#define XFS_DEFAULT_MAX_OPEN_ZONES 128 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, xfs_rgblock_t *write_pointer); diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index f152b2182004..1147bacb2da8 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1131,7 +1131,7 @@ xfs_calc_open_zones( if (bdev_open_zones) mp->m_max_open_zones = bdev_open_zones; else - mp->m_max_open_zones = xfs_max_open_zones(mp); + mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES; } if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { From 3c54e6027f14c4f54be5508af748f6cc2fd72f89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Sep 2025 06:14:11 -0700 Subject: [PATCH 48/53] xfs: constify xfs_errortag_random_default This table is never modified, so mark it const. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ac895cd2bc0a..39830b252ac8 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,7 @@ #define XFS_ERRTAG(_tag, _name, _default) \ [XFS_ERRTAG_##_tag] = (_default), #include "xfs_errortag.h" -static unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; #undef XFS_ERRTAG struct xfs_errortag_attr { From 42852fe57c6d2a0abb10429841cb1226b7186b7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Sep 2025 06:12:08 -0700 Subject: [PATCH 49/53] xfs: track the number of blocks in each buftarg Add a bt_nr_sectors to track the number of sector in each buftarg, and replace the check that hard codes sb_dblock in xfs_buf_map_verify with this new value so that it is correct for non-ddev buftargs. The RT buftarg only has a superblock in the first block, so it is unlikely to trigger this, or are we likely to ever have enough blocks in the in-memory buftargs, but we might as well get the check right. Fixes: 10616b806d1d ("xfs: fix _xfs_buf_find oops on blocks beyond the filesystem end") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 42 +++++++++++++++++++---------------- fs/xfs/xfs_buf.h | 4 +++- fs/xfs/xfs_buf_item_recover.c | 10 +++++++++ fs/xfs/xfs_super.c | 7 +++--- fs/xfs/xfs_trans.c | 23 ++++++++++--------- 5 files changed, 52 insertions(+), 34 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8360e77b3215..773d959965dc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -387,8 +387,6 @@ xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { - xfs_daddr_t eofs; - /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); @@ -397,11 +395,10 @@ xfs_buf_map_verify( * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes( int xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - int error; + struct xfs_mount *mp = btp->bt_mount; - ASSERT(btp->bt_bdev != NULL); + if (btp->bt_bdev) { + int error; - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); - if (error) { - xfs_warn(btp->bt_mount, - "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } - if (bdev_can_atomic_write(btp->bt_bdev)) - xfs_configure_buftarg_atomic_writes(btp); + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1749,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = (xfs_daddr_t)-1; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b269e115d9ac..8fa7bdf59c91 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -103,6 +103,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 5d58e2ae4972..e4c8af873632 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. + */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 77acb3e5a4ec..9e759c5b1096 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -535,7 +535,8 @@ xfs_setup_devices( { int error; - error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; @@ -545,7 +546,7 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_configure_buftarg(mp->m_logdev_targp, - log_sector_size); + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } @@ -559,7 +560,7 @@ xfs_setup_devices( mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { error = xfs_configure_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 575e7028f423..474f5a04ec63 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -452,19 +452,17 @@ xfs_trans_mod_sb( */ STATIC void xfs_trans_apply_sb_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { - struct xfs_dsb *sbp; - struct xfs_buf *bp; - int whole = 0; - - bp = xfs_trans_getsb(tp); - sbp = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp); + struct xfs_dsb *sbp = bp->b_addr; + int whole = 0; /* * Only update the superblock counters if we are logging them */ - if (!xfs_has_lazysbcount((tp->t_mountp))) { + if (!xfs_has_lazysbcount(mp)) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas( * write the correct value ondisk. */ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && - !xfs_has_rtgroups(tp->t_mountp)) { - struct xfs_mount *mp = tp->t_mountp; + !xfs_has_rtgroups(mp)) { int64_t rtxdelta; rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; @@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas( if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + mp->m_ddev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_dblocks_delta); whole = 1; } if (tp->t_agcount_delta) { @@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas( * recompute the ondisk rtgroup block log. The incore values * will be recomputed in xfs_trans_unreserve_and_mod_sb. */ - if (xfs_has_rtgroups(tp->t_mountp)) { + if (xfs_has_rtgroups(mp)) { sbp->sb_rgblklog = xfs_compute_rgblklog( be32_to_cpu(sbp->sb_rgextents), be32_to_cpu(sbp->sb_rextsize)); @@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rblocks_delta) { be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + mp->m_rtdev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_rblocks_delta); whole = 1; } if (tp->t_rextents_delta) { From 6ef2175fce30ccab80d519f2afcc93d8b138c16c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Sep 2025 06:12:09 -0700 Subject: [PATCH 50/53] xfs: use bt_nr_sectors in xfs_dax_translate_range Only ranges inside the file system can be translated, and the file system can be smaller than the containing device. Fixes: f4ed93037966 ("xfs: don't shut down the filesystem for media failures beyond end of log") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_notify_failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index fbeddcac4792..b17672889942 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -165,7 +165,7 @@ xfs_dax_translate_range( uint64_t *bblen) { u64 dev_start = btp->bt_dax_part_off; - u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_len = BBTOB(btp->bt_nr_sectors); u64 dev_end = dev_start + dev_len - 1; /* Notify failure on the whole device. */ From fc0d192303bd385ac24dc52eb31ceb6ca7e027d0 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 18 Sep 2025 14:14:03 +0300 Subject: [PATCH 51/53] xfs: scrub: use kstrdup_const() for metapath scan setups Except 'xchk_setup_metapath_rtginode()' case, 'path' argument of 'xchk_setup_metapath_scan()' is a compile-time constant. So it may be reasonable to use 'kstrdup_const()' / 'kree_const()' to manage 'path' field of 'struct xchk_metapath' in attempt to reuse .rodata instance rather than making a copy. Compile tested only. Signed-off-by: Dmitry Antipov Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/metapath.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index 14939d7de349..378ec7c8d38e 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -79,7 +79,7 @@ xchk_metapath_cleanup( if (mpath->dp_ilock_flags) xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); - kfree(mpath->path); + kfree_const(mpath->path); } /* Set up a metadir path scan. @path must be dynamically allocated. */ @@ -98,13 +98,13 @@ xchk_setup_metapath_scan( error = xchk_install_live_inode(sc, ip); if (error) { - kfree(path); + kfree_const(path); return error; } mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); if (!mpath) { - kfree(path); + kfree_const(path); return -ENOMEM; } @@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); + kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip); } /* Scan a rtgroup inode under the /rtgroups directory. */ @@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kstrdup("quota", GFP_KERNEL), qi->qi_dirip); + kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode( return -ENOENT; return xchk_setup_metapath_scan(sc, qi->qi_dirip, - kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); + kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip); } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) From bc7d684fea18cc48c3630d2b7f1789000ff2df5b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 18 Sep 2025 08:12:53 +1000 Subject: [PATCH 52/53] xfs: rearrange code in xfs_inode_item_precommit There are similar extsize checks and updates done inside and outside the inode item lock, which could all be done under a single top level logic branch outside the ili_lock. The COW extsize fixup can potentially miss updating the XFS_ILOG_CORE in ili_fsync_fields, so moving this code up above the ili_fsync_fields update could also be considered a fix. Further, to make the next change a bit cleaner, move where we calculate the on-disk flag mask to after we attach the cluster buffer to the the inode log item. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode_item.c | 65 ++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 829675700fcd..678ca95793e0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -204,6 +186,17 @@ xfs_inode_item_precommit( xfs_trans_brelse(tp, bp); } + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed. + */ + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines From c91d38b57f2c4784d885c874b2a1234a01361afd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 18 Sep 2025 08:12:54 +1000 Subject: [PATCH 53/53] xfs: rework datasync tracking and execution Jan Kara reported that the shared ILOCK held across the journal flush during fdatasync operations slows down O_DSYNC DIO on unwritten extents significantly. The underlying issue is that unwritten extent conversion needs the ILOCK exclusive, whilst the datasync operation after the extent conversion holds it shared. Hence we cannot be flushing the journal for one IO completion whilst at the same time doing unwritten extent conversion on another IO completion on the same inode. This means that IO completions lock-step, and IO performance is dependent on the journal flush latency. Jan demonstrated that reducing the ifdatasync lock hold time can improve O_DSYNC DIO to unwritten extents performance by 2.5x. Discussion on that patch found issues with the method, and we came to the conclusion that separately tracking datasync flush sequences was the best approach to solving the problem. The fsync code uses the ILOCK to serialise against concurrent modifications in the transaction commit phase. In a transaction commit, there are several disjoint updates to inode log item state that need to be considered atomically by the fsync code. These operations are all done under ILOCK_EXCL context: 1. ili_fsync_flags is updated in ->iop_precommit 2. i_pincount is updated in ->iop_pin before it is added to the CIL 3. ili_commit_seq is updated in ->iop_committing, after it has been added to the CIL In fsync, we need to: 1. check that the inode is dirty in the journal (ipincount) 2. check that ili_fsync_flags is set 3. grab the ili_commit_seq if a journal flush is needed 4. clear the ili_fsync_flags to ensure that new modifications that require fsync are tracked in ->iop_precommit correctly The serialisation of ipincount/ili_commit_seq is needed to ensure that we don't try to unnecessarily flush the journal. The serialisation of ili_fsync_flags being set in ->iop_precommit and cleared in fsync post journal flush is required for correctness. Hence holding the ILOCK_SHARED in xfs_file_fsync() performs all this serialisation for us. Ideally, we want to remove the need to hold the ILOCK_SHARED in xfs_file_fsync() for best performance. We start with the observation that fsync/fdatasync() only need to wait for operations that have been completed. Hence operations that are still being committed have not completed and datasync operations do not need to wait for them. This means we can use a single point in time in the commit process to signal "this modification is complete". This is what ->iop_committing is supposed to provide - it is the point at which the object is unlocked after the modification has been recorded in the CIL. Hence we could use ili_commit_seq to determine if we should flush the journal. In theory, we can already do this. However, in practice this will expose an internal global CIL lock to the IO path. The ipincount() checks optimise away the need to take this lock - if the inode is not pinned, then it is not in the CIL and we don't need to check if a journal flush at ili_commit_seq needs to be performed. The reason this is needed is that the ili_commit_seq is never cleared. Once it is set, it remains set even once the journal has been committed and the object has been unpinned. Hence we have to look that journal internal commit sequence state to determine if ili_commit_seq needs to be acted on or not. We can solve this by clearing ili_commit_seq when the inode is unpinned. If we clear it atomically with the last unpin going away, then we are guaranteed that new modifications will order correctly as they add a new pin counts and we won't clear a sequence number for an active modification in the CIL. Further, we can then allow the per-transaction flag state to propagate into ->iop_committing (instead of clearing it in ->iop_precommit) and that will allow us to determine if the modification needs a full fsync or just a datasync, and so we can record a separate datasync sequence number (Jan's idea!) and then use that in the fdatasync path instead of the full fsync sequence number. With this infrastructure in place, we no longer need the ILOCK_SHARED in the fsync path. All serialisation is done against the commit sequence numbers - if the sequence number is set, then we have to flush the journal. If it is not set, then we have nothing to do. This greatly simplifies the fsync implementation.... Signed-off-by: Dave Chinner Tested-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 75 +++++++++++++++++++---------------------- fs/xfs/xfs_inode.c | 25 +++++++++----- fs/xfs/xfs_inode_item.c | 72 ++++++++++++++++++++++++++++++--------- fs/xfs/xfs_inode_item.h | 10 +++++- fs/xfs/xfs_iomap.c | 15 +++++++-- 5 files changed, 128 insertions(+), 69 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f96fbf5c54c9..2702fef2c90c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -75,52 +75,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number than holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. + * + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. */ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, log_flushed); - - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; } STATIC int @@ -158,12 +153,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has a inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking. */ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2bfe87b09c2a..047a5260cf70 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1667,7 +1667,6 @@ xfs_ifree_mark_inode_stale( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1832,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2504,7 +2511,6 @@ xfs_iflush( spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2663,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 678ca95793e0..1bd411a1114c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -187,13 +187,16 @@ xfs_inode_item_precommit( } /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. */ - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); if (flags & XFS_ILOG_IVERSION) flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); @@ -207,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -722,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. + */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -851,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1048,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index ba92ce11a011..2ddcca41714f 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock. + */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2570d0a66047..d3f6e3e42a11 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -149,9 +149,18 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. + */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; return 0;