From 915175b49f65d9edeb81659e82cbb27b621dbc17 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Wed, 15 Jan 2025 20:35:25 +0800 Subject: [PATCH 1/6] xfs: fix the entry condition of exact EOF block allocation optimization When we call create(), lseek() and write() sequentially, offset != 0 cannot be used as a judgment condition for whether the file already has extents. Furthermore, when xfs_bmap_adjacent() has not given a better blkno, it is not necessary to use exact EOF block allocation. Suggested-by: Dave Chinner Signed-off-by: Jinliang Zheng Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_bmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b..0ef19f1469ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3563,12 +3563,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3736,7 +3736,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for From 89841b23809f5fb12cbead142204064739fef25a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jan 2025 07:03:35 +0100 Subject: [PATCH 2/6] xfs: remove an out of data comment in _xfs_buf_alloc There hasn't been anything like an io_length for a long time. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7fbdd4b30676..f1252ed8bd0a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -232,11 +232,6 @@ _xfs_buf_alloc( bp->b_mount = target->bt_mount; bp->b_flags = flags; - /* - * Set length and io_length to the same value initially. - * I/O routines should use io_length, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ error = xfs_buf_get_maps(bp, nmaps); if (error) { kmem_cache_free(xfs_buf_cache, bp); From f5f0ed89f13e3e5246404a322ee85169a226bfb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jan 2025 06:43:21 +0100 Subject: [PATCH 3/6] xfs: don't call remap_verify_area with sb write protection held The XFS_IOC_EXCHANGE_RANGE ioctl with the XFS_EXCHANGE_RANGE_TO_EOF flag operates on a range bounded by the end of the file. This means the actual amount of blocks exchanged is derived from the inode size, which is only stable with the IOLOCK (i_rwsem) held. Do that, it currently calls remap_verify_area from inside the sb write protection which nests outside the IOLOCK. But this makes fsnotify_file_area_perm which is called from remap_verify_area unhappy when the kernel is built with lockdep and the recently added CONFIG_FANOTIFY_ACCESS_PERMISSIONS option. Fix this by always calling remap_verify_area before taking the write protection, and passing a 0 size to remap_verify_area similar to the FICLONE/FICLONERANGE ioctls when they are asked to clone until the file end. (Note: the size argument gets passed to fsnotify_file_area_perm, but then isn't actually used there). Fixes: 9a64d9b3109d ("xfs: introduce new file range exchange ioctl") Cc: # v6.10 Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_exchrange.c | 71 ++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index f340a2015c4c..0b41bdfecdfb 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -329,22 +329,6 @@ xfs_exchrange_mappings( * successfully but before locks are dropped. */ -/* Verify that we have security clearance to perform this operation. */ -static int -xfs_exchange_range_verify_area( - struct xfs_exchrange *fxr) -{ - int ret; - - ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, - true); - if (ret) - return ret; - - return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, - true); -} - /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. @@ -355,11 +339,13 @@ xfs_exchange_range_checks( unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); + loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); + loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; - loff_t size1, size2, tmp; + loff_t tmp; int error; /* Don't touch certain kinds of inodes */ @@ -368,24 +354,25 @@ xfs_exchange_range_checks( if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; - size1 = i_size_read(inode1); - size2 = i_size_read(inode2); - /* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; - /* - * If the caller said to exchange to EOF, we set the length of the - * request large enough to cover everything to the end of both files. - */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + /* + * If the caller said to exchange to EOF, we set the length of + * the request large enough to cover everything to the end of + * both files. + */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); - - error = xfs_exchange_range_verify_area(fxr); - if (error) - return error; + } else { + /* + * Otherwise we require both ranges to end within EOF. + */ + if (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2) + return -EINVAL; } /* @@ -401,15 +388,6 @@ xfs_exchange_range_checks( check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) return -EINVAL; - /* - * We require both ranges to end within EOF, unless we're exchanging - * to EOF. - */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && - (fxr->file1_offset + fxr->length > size1 || - fxr->file2_offset + fxr->length > size2)) - return -EINVAL; - /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole @@ -747,6 +725,7 @@ xfs_exchange_range( { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); + loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & @@ -779,14 +758,18 @@ xfs_exchange_range( return -EBADF; /* - * If we're not exchanging to EOF, we can check the areas before - * stabilizing both files' i_size. + * If we're exchanging to EOF we can't calculate the length until taking + * the iolock. Pass a 0 length to remap_verify_area similar to the + * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { - ret = xfs_exchange_range_verify_area(fxr); - if (ret) - return ret; - } + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + check_len = 0; + ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); + if (ret) + return ret; + ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); + if (ret) + return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) From fb95897b8c60653805aa09daec575ca30983f768 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Fri, 24 Jan 2025 11:22:28 +0800 Subject: [PATCH 4/6] xfs: Propagate errors from xfs_reflink_cancel_cow_range in xfs_dax_write_iomap_end In xfs_dax_write_iomap_end(), directly return the result of xfs_reflink_cancel_cow_range() when !written, ensuring proper error propagation and improving code robustness. Fixes: ea6c49b784f0 ("xfs: support CoW in fsdax mode") Cc: stable@vger.kernel.org # v6.0 Reviewed-by: Darrick J. Wong Signed-off-by: Wentao Liang Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_iomap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c..d61460309a78 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -976,10 +976,8 @@ xfs_dax_write_iomap_end( if (!xfs_is_cow_inode(ip)) return 0; - if (!written) { - xfs_reflink_cancel_cow_range(ip, pos, length, true); - return 0; - } + if (!written) + return xfs_reflink_cancel_cow_range(ip, pos, length, true); return xfs_reflink_end_cow(ip, pos, written); } From 26b63bee2f6e711c5a169997fd126fddcfb90848 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Fri, 24 Jan 2025 11:45:09 +0800 Subject: [PATCH 5/6] xfs: Add error handling for xfs_reflink_cancel_cow_range In xfs_inactive(), xfs_reflink_cancel_cow_range() is called without error handling, risking unnoticed failures and inconsistent behavior compared to other parts of the code. Fix this issue by adding an error handling for the xfs_reflink_cancel_cow_range(), improving code robustness. Fixes: 6231848c3aa5 ("xfs: check for cow blocks before trying to clear them") Cc: stable@vger.kernel.org # v4.17 Reviewed-by: Darrick J. Wong Signed-off-by: Wentao Liang Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c95fe1b1de4e..b1f9f156ec88 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1404,8 +1404,11 @@ xfs_inactive( goto out; /* Try to clean out the cow blocks if there are any. */ - if (xfs_inode_has_cow_data(ip)) - xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (xfs_inode_has_cow_data(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error) + goto out; + } if (VFS_I(ip)->i_nlink != 0) { /* From a9ab28b3d21aec6d0f56fe722953e20ce470237b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jan 2025 06:22:58 +0100 Subject: [PATCH 6/6] xfs: remove xfs_buf_cache.bc_lock xfs_buf_cache.bc_lock serializes adding buffers to and removing them from the hashtable. But as the rhashtable code already uses fine grained internal locking for inserts and removals the extra protection isn't actually required. It also happens to fix a lock order inversion vs b_lock added by the recent lookup race fix. Fixes: ee10f6fcdb96 ("xfs: fix buffer lookup vs release race") Reported-by: Lai, Yi Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 31 +++++++++++++++++-------------- fs/xfs/xfs_buf.h | 1 - 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f1252ed8bd0a..ef207784876c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -41,8 +41,7 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_rele: * b_lock - * pag_buf_lock - * lru_lock + * lru_lock * * xfs_buftarg_drain_rele * lru_lock @@ -220,14 +219,21 @@ _xfs_buf_alloc( */ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - spin_lock_init(&bp->b_lock); + /* + * A new buffer is held and locked by the owner. This ensures that the + * buffer is owned by the caller and racing RCU lookups right after + * inserting into the hash table are safe (and will have to wait for + * the unlock to do anything non-trivial). + */ bp->b_hold = 1; + sema_init(&bp->b_sema, 0); /* held, no waiters */ + + spin_lock_init(&bp->b_lock); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); - sema_init(&bp->b_sema, 0); /* held, no waiters */ bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; @@ -497,7 +503,6 @@ int xfs_buf_cache_init( struct xfs_buf_cache *bch) { - spin_lock_init(&bch->bc_lock); return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } @@ -647,17 +652,20 @@ xfs_buf_find_insert( if (error) goto out_free_buf; - spin_lock(&bch->bc_lock); + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + + rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { + rcu_read_unlock(); error = PTR_ERR(bp); - spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ - spin_unlock(&bch->bc_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -665,10 +673,8 @@ xfs_buf_find_insert( *bpp = bp; goto out_free_buf; } + rcu_read_unlock(); - /* The new buffer keeps the perag reference until it is freed. */ - new_bp->b_pag = pag; - spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; @@ -1085,7 +1091,6 @@ xfs_buf_rele_cached( } /* we are asked to drop the last reference */ - spin_lock(&bch->bc_lock); __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* @@ -1097,7 +1102,6 @@ xfs_buf_rele_cached( bp->b_state &= ~XFS_BSTATE_DISPOSE; else bp->b_hold--; - spin_unlock(&bch->bc_lock); } else { bp->b_hold--; /* @@ -1115,7 +1119,6 @@ xfs_buf_rele_cached( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&bch->bc_lock); if (pag) xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7e73663c5d4a..3b4ed42e11c0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -80,7 +80,6 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { - spinlock_t bc_lock; struct rhashtable bc_hash; };