From 1805e6b2f49fbf63322a629a36019cbe2c6628e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Jan 2026 16:43:21 -0500 Subject: [PATCH 01/21] NFSv4/pnfs: If the server is down, retry the layout returns on reboot If a layout return is embedded in a CLOSE or DELEGRETURN rpc call, and the metadata server reboots, the expectation now is that the client should resend the layout return once the server comes back up. This patch changes the current behaviour of dropping the layouts on the floor, and instead queues them up for retrying. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 ++++++++++++++++++++---------- fs/nfs/pnfs.c | 22 +++++++++++++++++----- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 91bcf67bd743..768de9935ff1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9769,16 +9769,26 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; - if (task->tk_rpc_status == -ETIMEDOUT) { - lrp->rpc_status = -EAGAIN; - lrp->res.lrs_present = 0; - return; - } - /* - * Was there an RPC level error? 
Assume the call succeeded, * and that we need to release the layout */ - if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + if (task->tk_rpc_status < 0) { + switch (task->tk_rpc_status) { + case -EACCES: + case -EIO: + case -EKEYEXPIRED: + case -ERESTARTSYS: + case -EINTR: + lrp->rpc_status = 0; + break; + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + lrp->rpc_status = 0; + else + lrp->rpc_status = -EAGAIN; + break; + default: + lrp->rpc_status = -EAGAIN; + break; + } lrp->res.lrs_present = 0; return; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bc13d1e69449..e89e476070a1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1698,11 +1698,23 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, /* If the call was not sent, let caller handle it */ if (!RPC_WAS_SENT(task)) return 0; - /* - * Otherwise, assume the call succeeded and - * that we need to release the layout - */ - *ret = 0; + switch (task->tk_rpc_status) { + default: + /* + * Defer the layoutreturn if it was due + * to the server being down. + */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case -EACCES: + case -EIO: + case -EKEYEXPIRED: + case -ERESTARTSYS: + case -EINTR: + /* Don't retry */ + *ret = 0; + break; + } (*respp)->lrs_present = 0; retval = 0; break; From 3a06bac55bf56290673ea67abe3d285f0ab3837a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 20 Feb 2026 16:42:18 -0500 Subject: [PATCH 02/21] NFS: improve "Server wrote zero bytes" error When a pnfs error occurs, the IO is retried against the MDS. However, the initial IO leads to the kernel logging "Server wrote zero bytes" when in fact the MDS IO will not fail and thus the error misleads administrators that the system is experiencing issues. When pnfs IO fails, it triggers pnfs_write_done_resent_to_mds(), which ends up clearing nfs_pgio_header's pages structure (copying the content into a new one to do new RPC calls to the MDS). 
Thus, in nfs_writeback_result(), when we have no pages to work with, there is no need to retry, and we can therefore skip logging the message about 0 bytes. Fixes: 6c75dc0d498c ("NFS: merge _full and _partial write rpc_ops") Suggested-by: Trond Myklebust Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1ed4b3590b1a..f1f62787dd74 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1551,7 +1551,7 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; - if (resp->count < argp->count) { + if (resp->count < argp->count && !list_empty(&hdr->pages)) { static unsigned long complain; /* This a short write! */ From 16d99dce938ecbbc703843a31fb951acca46af27 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 24 Mar 2026 13:32:11 -0400 Subject: [PATCH 03/21] nfs: fix utimensat() for atime with delegated timestamps xfstest generic/221 is failing with delegated timestamps enabled. When the client holds a WRITE_ATTRS_DELEG delegation, and a userland process does a utimensat() for only the atime, the ctime is not properly updated. The problem is that the client tries to cache the atime update, but there is no mtime update, so the delegated attribute update never updates the ctime. Delegated timestamps don't have a mechanism to update the ctime in accordance with atime-only changes due to utimensat() and the like. Change the client to issue an RPC in this case, so that the ctime gets properly updated alongside the atime. 
Fixes: 40f45ab3814f ("NFS: Further fixes to attribute delegation a/mtime changes") Reported-by: Olga Kornievskaia Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4786343eeee0..3a5bba7e3c92 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -757,14 +757,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - if (attr->ia_valid & ATTR_ATIME_SET) { - if (uid_eq(task_uid, owner_uid)) { - spin_lock(&inode->i_lock); - nfs_set_timestamps_to_ts(inode, attr); - spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); - } - } else { + if (!(attr->ia_valid & ATTR_ATIME_SET)) { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; } From 9c332d7f63401c3ff1765c9998531b3784f3f9a4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 24 Mar 2026 13:32:12 -0400 Subject: [PATCH 04/21] nfs: update inode ctime after removexattr operation xfstest generic/728 fails with delegated timestamps. The client does a removexattr and then a stat to test the ctime, which doesn't change. The stat() doesn't trigger a GETATTR because of the delegated timestamps, so it relies on the cached ctime, which is wrong. The setxattr compound has a trailing GETATTR, which ensures that its ctime gets updated. Follow the same strategy with removexattr. 
Fixes: 3e1f02123fba ("NFSv4.2: add client side XDR handling for extended attributes") Reported-by: Olga Kornievskaia Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 18 ++++++++++++++++-- fs/nfs/nfs42xdr.c | 10 ++++++++-- include/linux/nfs_xdr.h | 3 +++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 7b3ca68fb4bb..7e5c1172fc11 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1372,11 +1372,15 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f, static int _nfs42_proc_removexattr(struct inode *inode, const char *name) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS_BITMASK_SZ]; struct nfs42_removexattrargs args = { .fh = NFS_FH(inode), + .bitmask = bitmask, .xattr_name = name, }; - struct nfs42_removexattrres res; + struct nfs42_removexattrres res = { + .server = server, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR], .rpc_argp = &args, @@ -1385,12 +1389,22 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name) int ret; unsigned long timestamp = jiffies; + res.fattr = nfs_alloc_fattr(); + if (!res.fattr) + return -ENOMEM; + + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, + inode, NFS_INO_INVALID_CHANGE); + ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); trace_nfs4_removexattr(inode, name, ret); - if (!ret) + if (!ret) { nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + ret = nfs_post_op_update_inode(inode, res.fattr); + } + kfree(res.fattr); return ret; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 5c7452ce6e8a..ec105c62f721 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -263,11 +263,13 @@ #define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_removexattr_maxsz) + encode_removexattr_maxsz + \ + encode_getattr_maxsz) 
#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_removexattr_maxsz) + decode_removexattr_maxsz + \ + decode_getattr_maxsz) /* * These values specify the maximum amount of data that is not @@ -869,6 +871,7 @@ static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_removexattr(xdr, args->xattr_name, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -1818,6 +1821,9 @@ static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req, goto out; status = decode_removexattr(xdr, &res->cinfo); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ff1f12aa73d2..fcbd21b5685f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1611,12 +1611,15 @@ struct nfs42_listxattrsres { struct nfs42_removexattrargs { struct nfs4_sequence_args seq_args; struct nfs_fh *fh; + const u32 *bitmask; const char *xattr_name; }; struct nfs42_removexattrres { struct nfs4_sequence_res seq_res; struct nfs4_change_info cinfo; + struct nfs_fattr *fattr; + const struct nfs_server *server; }; #endif /* CONFIG_NFS_V4_2 */ From 24297c7cd3f9389374bb13d1ca578c335d2866b9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:22 -0500 Subject: [PATCH 05/21] xprtrdma: Close sendctx get/put race that can block a transport rpcrdma_sendctx_get_locked() and rpcrdma_sendctx_put_locked() can race in a way that leaves XPRT_WRITE_SPACE set permanently, blocking all further sends on the transport: get_locked put_locked (Send completion) ---------- -------------------------- read rb_sc_tail -> ring full advance rb_sc_tail xprt_write_space(): test_bit(WRITE_SPACE) -> not set, return set_bit(WRITE_SPACE) return NULL (-EAGAIN) After the sender releases XPRT_LOCKED, the release path refuses to 
wake the next task because XPRT_WRITE_SPACE is set. The sender retries, finds XPRT_WRITE_SPACE still set, and sleeps on xprt_sending. No further Send completions arrive to clear the flag because no new Sends can be posted. With nconnect, the stalled transport's share of congestion credits are never returned, starving the remaining transports as well. Fixes: 05eb06d86685 ("xprtrdma: Fix occasional transport deadlock") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b51a162885bb..90fd83f2d846 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -708,6 +708,18 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt) */ xprt_wait_for_buffer_space(&r_xprt->rx_xprt); r_xprt->rx_stats.empty_sendctx_q++; + + /* Recheck: a Send completion between the ring-empty test + * and the set_bit could cause its xprt_write_space() to + * miss, leaving XPRT_WRITE_SPACE set with a non-full ring. + * The smp_mb__after_atomic() pairs with smp_store_release() + * in rpcrdma_sendctx_put_locked(). + */ + smp_mb__after_atomic(); + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + if (next_head != READ_ONCE(buf->rb_sc_tail)) + xprt_write_space(&r_xprt->rx_xprt); + return NULL; } @@ -739,7 +751,10 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, } while (buf->rb_sc_ctxs[next_tail] != sc); - /* Paired with READ_ONCE */ + /* Paired with READ_ONCE in rpcrdma_sendctx_get_locked(): + * both the fast-path ring-full test and the post-set_bit + * recheck in the slow path depend on this store-release. 
+ */ smp_store_release(&buf->rb_sc_tail, next_tail); xprt_write_space(&r_xprt->rx_xprt); From 100142093e22b3f7741ac88e94878bb3694e306f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:23 -0500 Subject: [PATCH 06/21] xprtrdma: Avoid 250 ms delay on backlog wakeup Commit a721035477fb ("SUNRPC/xprt: async tasks mustn't block waiting for memory") changed xprt_rdma_alloc_slot() to set tk_status to -ENOMEM so that call_reserveresult() would sleep HZ/4 before retrying. That rationale applies to xprt_dynamic_alloc_slot(), where an immediate retry under memory pressure wastes CPU, but not to the RDMA backlog path: a task woken from the backlog has a slot waiting for it, so the 250 ms rpc_delay adds latency without benefit. This also aligns the code with the existing kernel-doc for xprt_rdma_alloc_slot(), which already documented %-EAGAIN. Fixes: a721035477fb ("SUNRPC/xprt: async tasks mustn't block waiting for memory") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9a8ce5df83ca..ca079439f9cc 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -510,7 +510,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: - task->tk_status = -ENOMEM; + task->tk_status = -EAGAIN; xprt_add_backlog(xprt, task); } From 765bde47fe7f197dabeb12da76831f40d0b20377 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:24 -0500 Subject: [PATCH 07/21] xprtrdma: Close lost-wakeup race in xprt_rdma_alloc_slot xprt_rdma_alloc_slot() and xprt_rdma_free_slot() lack serialization between the buffer pool and the backlog queue. 
A buffer freed after rpcrdma_buffer_get() finds the pool empty but before rpc_sleep_on() places the task on the backlog is returned to the pool with no waiter to wake, leaving the task stuck on the backlog indefinitely. After joining the backlog, re-check the pool and route any recovered buffer through xprt_wake_up_backlog(), whose queue lock serializes with concurrent wakeups and avoids double-assignment of slots. Because xprt_rdma_free_slot() does not hold reserve_lock, the XPRT_CONGESTED double-check in xprt_throttle_congested() is ineffective: a task can join the backlog through that path after free_slot has already found it empty and cleared the bit. Avoid this by using xprt_add_backlog_noncongested(), which queues the task without setting XPRT_CONGESTED, so every allocation reaches xprt_rdma_alloc_slot() and its post-sleep re-check. Fixes: edb41e61a54e ("xprtrdma: Make rpc_rqst part of rpcrdma_req") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 16 ++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f46d1fb8f71a..a82045804d34 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -404,6 +404,8 @@ struct rpc_xprt * xprt_alloc(struct net *net, size_t size, unsigned int max_req); void xprt_free(struct rpc_xprt *); void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task); bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); void xprt_cleanup_ids(void); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4fbb57a29704..48a3618cbb29 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1663,6 +1663,22 @@ void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_add_backlog); 
+/** + * xprt_add_backlog_noncongested - queue task on backlog + * @xprt: transport whose backlog queue receives the task + * @task: task to queue + * + * Like xprt_add_backlog, but does not set XPRT_CONGESTED. + * For transports whose free_slot path does not synchronize + * with xprt_throttle_congested via reserve_lock. + */ +void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, + struct rpc_task *task) +{ + rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); +} +EXPORT_SYMBOL_GPL(xprt_add_backlog_noncongested); + static bool __xprt_set_rq(struct rpc_task *task, void *data) { struct rpc_rqst *req = data; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ca079439f9cc..61706df5e485 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -511,7 +511,20 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) out_sleep: task->tk_status = -EAGAIN; - xprt_add_backlog(xprt, task); + xprt_add_backlog_noncongested(xprt, task); + /* A buffer freed between buffer_get and rpc_sleep_on + * goes back to the pool with no waiter to wake. + * Re-check after joining the backlog to close that gap. + */ + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (req) { + struct rpc_rqst *rqst = &req->rl_slot; + + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, req); + } + } } /** From 6f2e565fb3bd68636e4920223e599d70861f8ba6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:25 -0500 Subject: [PATCH 08/21] xprtrdma: Decouple frwr_wp_create from frwr_map frwr_wp_create is the only caller of frwr_map outside the encode path. It registers a single 4-byte write-pad region from a stack- local rpcrdma_mr_seg. 
Inlining the registration logic directly (sg_init_table + sg_set_page + ib_dma_map_sg + ib_map_mr_sg + IOVA mangle + reg_wr setup) eliminates the coupling that would otherwise complicate the removal of rpcrdma_mr_seg from frwr_map's interface. The inlined version adds a proper error-unwind ladder: on failure, the DMA mapping (if established) is released, ep->re_write_pad_mr is cleared, and the MR is returned to the transport free list. The old frwr_map-based code relied on rpcrdma_mrs_destroy at teardown to reclaim partially-initialized MRs. This is a one-time setup path; duplicating ~20 lines is a reasonable tradeoff for decoupling the write-pad registration from the data- path MR registration. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 57 +++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 31434aeb8e29..4331b0b65f4c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -669,9 +669,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_mr_seg seg; + struct ib_reg_wr *reg_wr; struct rpcrdma_mr *mr; + struct ib_mr *ibmr; + int dma_nents; + int ret; mr = rpcrdma_mr_get(r_xprt); if (!mr) @@ -679,11 +683,39 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) mr->mr_req = NULL; ep->re_write_pad_mr = mr; - seg.mr_len = XDR_UNIT; - seg.mr_page = virt_to_page(ep->re_write_pad); - seg.mr_offset = offset_in_page(ep->re_write_pad); - if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) - return -EIO; + sg_init_table(mr->mr_sg, 1); + sg_set_page(mr->mr_sg, virt_to_page(ep->re_write_pad), + XDR_UNIT, offset_in_page(ep->re_write_pad)); + + mr->mr_dir = DMA_FROM_DEVICE; + mr->mr_nents = 1; + dma_nents 
= ib_dma_map_sg(ep->re_id->device, mr->mr_sg, + mr->mr_nents, mr->mr_dir); + if (!dma_nents) { + ret = -EIO; + goto out_mr; + } + mr->mr_device = ep->re_id->device; + + ibmr = mr->mr_ibmr; + if (ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, + PAGE_SIZE) != dma_nents) { + ret = -EIO; + goto out_unmap; + } + + /* IOVA is not tagged with an XID; the write-pad is not RPC-specific. */ + ib_update_fast_reg_key(ibmr, ib_inc_rkey(ibmr->rkey)); + + reg_wr = &mr->mr_regwr; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; + reg_wr->access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; + + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; + trace_xprtrdma_mr_fastreg(mr); mr->mr_cqe.done = frwr_wc_fastreg; @@ -693,5 +725,16 @@ int frwr_wp_create(struct rpcrdma_xprt *r_xprt) mr->mr_regwr.wr.opcode = IB_WR_REG_MR; mr->mr_regwr.wr.send_flags = 0; - return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); + ret = ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); + if (!ret) + return 0; + +out_unmap: + frwr_mr_unmap(mr); +out_mr: + ep->re_write_pad_mr = NULL; + spin_lock(&buf->rb_lock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_lock); + return ret; } From 7a079ab57c4eeff241d9abfc1ec6477cb90a6206 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:26 -0500 Subject: [PATCH 09/21] xprtrdma: Replace rpcrdma_mr_seg with xdr_buf cursor The FRWR registration path converts data through three representations: xdr_buf -> rpcrdma_mr_seg[] -> scatterlist[] -> ib_map_mr_sg(). The rpcrdma_mr_seg intermediate is a relic of when multiple registration strategies existed (FMR, physical, FRWR). Only FRWR remains, so this indirection and the 6240-byte rl_segments[260] array embedded in each rpcrdma_req serve no purpose. Introduce struct rpcrdma_xdr_cursor to track position within an xdr_buf during iterative MR registration. 
Rewrite frwr_map to populate scatterlist entries directly from the xdr_buf regions (head kvec, page list, tail kvec). The boundary logic for non-SG_GAPS devices is simpler because the xdr_buf structure guarantees that page-region entries after the first start at offset 0, and that head/tail kvecs are separate regions that naturally break at MR boundaries. Fix a pre-existing bug in rpcrdma_encode_write_list where the write-pad statistics accumulator added mr->mr_length from the last data MR rather than the write-pad MR. The refactored code uses ep->re_write_pad_mr->mr_length. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 28 +++--- net/sunrpc/xprtrdma/frwr_ops.c | 117 ++++++++++++++++++----- net/sunrpc/xprtrdma/rpc_rdma.c | 163 +++++++++++--------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 42 +++++--- 4 files changed, 193 insertions(+), 157 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e6a72646c507..b79913048e1a 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -392,10 +392,10 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, const struct rpc_task *task, unsigned int pos, struct rpcrdma_mr *mr, - int nsegs + bool is_last ), - TP_ARGS(task, pos, mr, nsegs), + TP_ARGS(task, pos, mr, is_last), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -405,7 +405,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __field(u32, handle) __field(u32, length) __field(u64, offset) - __field(int, nsegs) + __field(bool, is_last) ), TP_fast_assign( @@ -416,7 +416,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; - __entry->nsegs = nsegs; + __entry->is_last = is_last; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -424,7 +424,7 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, __entry->task_id, __entry->client_id, __entry->pos, __entry->length, (unsigned long 
long)__entry->offset, __entry->handle, - __entry->nents < __entry->nsegs ? "more" : "last" + __entry->is_last ? "last" : "more" ) ); @@ -434,18 +434,18 @@ DECLARE_EVENT_CLASS(xprtrdma_rdch_event, const struct rpc_task *task, \ unsigned int pos, \ struct rpcrdma_mr *mr, \ - int nsegs \ + bool is_last \ ), \ - TP_ARGS(task, pos, mr, nsegs)) + TP_ARGS(task, pos, mr, is_last)) DECLARE_EVENT_CLASS(xprtrdma_wrch_event, TP_PROTO( const struct rpc_task *task, struct rpcrdma_mr *mr, - int nsegs + bool is_last ), - TP_ARGS(task, mr, nsegs), + TP_ARGS(task, mr, is_last), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -454,7 +454,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __field(u32, handle) __field(u32, length) __field(u64, offset) - __field(int, nsegs) + __field(bool, is_last) ), TP_fast_assign( @@ -464,7 +464,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; - __entry->nsegs = nsegs; + __entry->is_last = is_last; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -472,7 +472,7 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, __entry->task_id, __entry->client_id, __entry->length, (unsigned long long)__entry->offset, __entry->handle, - __entry->nents < __entry->nsegs ? "more" : "last" + __entry->is_last ? 
"last" : "more" ) ); @@ -481,9 +481,9 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, TP_PROTO( \ const struct rpc_task *task, \ struct rpcrdma_mr *mr, \ - int nsegs \ + bool is_last \ ), \ - TP_ARGS(task, mr, nsegs)) + TP_ARGS(task, mr, is_last)) TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL); TRACE_DEFINE_ENUM(DMA_TO_DEVICE); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 4331b0b65f4c..229057d35fb8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -268,10 +268,9 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) } /** - * frwr_map - Register a memory region + * frwr_map - Register a memory region from an xdr_buf cursor * @r_xprt: controlling transport - * @seg: memory region co-ordinates - * @nsegs: number of segments remaining + * @cur: cursor tracking position within the xdr_buf * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory * @mr: MR to fill in @@ -279,34 +278,104 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * - * Returns the next segment or a negative errno pointer. - * On success, @mr is filled in. + * Returns 0 on success (cursor advanced past consumed data, + * @mr populated) or a negative errno on failure. 
*/ -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr) +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr) { struct rpcrdma_ep *ep = r_xprt->rx_ep; + const struct xdr_buf *xdrbuf = cur->xc_buf; + bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS; + unsigned int max_depth = ep->re_max_fr_depth; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ep->re_max_fr_depth) - nsegs = ep->re_max_fr_depth; - for (i = 0; i < nsegs;) { - sg_set_page(&mr->mr_sg[i], seg->mr_page, - seg->mr_len, seg->mr_offset); + i = 0; - ++seg; - ++i; - if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) - continue; - if ((i < nsegs && seg->mr_offset) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; + /* Head kvec */ + if (!(cur->xc_flags & XC_HEAD_DONE)) { + const struct kvec *head = &xdrbuf->head[0]; + + sg_set_page(&mr->mr_sg[i], + virt_to_page(head->iov_base), + head->iov_len, + offset_in_page(head->iov_base)); + cur->xc_flags |= XC_HEAD_DONE; + i++; + /* Without sg-gap support, each non-contiguous region + * must be registered as a separate MR. Returning + * here after the head kvec causes the caller to + * invoke frwr_map() again for the page list and + * tail. 
+ */ + if (!sg_gaps) + goto finish; } + + /* Page list */ + if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) { + unsigned int page_base, remaining; + struct page **ppages; + + remaining = xdrbuf->page_len - cur->xc_page_offset; + page_base = offset_in_page(xdrbuf->page_base + + cur->xc_page_offset); + ppages = xdrbuf->pages + + ((xdrbuf->page_base + cur->xc_page_offset) + >> PAGE_SHIFT); + + while (remaining > 0 && i < max_depth) { + unsigned int len; + + len = min_t(unsigned int, + PAGE_SIZE - page_base, remaining); + sg_set_page(&mr->mr_sg[i], *ppages, + len, page_base); + cur->xc_page_offset += len; + i++; + ppages++; + remaining -= len; + + if (!sg_gaps && remaining > 0 && + offset_in_page(page_base + len)) + goto finish; + page_base = 0; + } + if (remaining == 0) + cur->xc_flags |= XC_PAGES_DONE; + } else if (!(cur->xc_flags & XC_PAGES_DONE)) { + cur->xc_flags |= XC_PAGES_DONE; + } + + /* Tail kvec */ + if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len && + i < max_depth) { + const struct kvec *tail = &xdrbuf->tail[0]; + + if (!sg_gaps && i > 0) { + struct scatterlist *prev = &mr->mr_sg[i - 1]; + + if (offset_in_page(prev->offset + prev->length) || + offset_in_page(tail->iov_base)) + goto finish; + } + sg_set_page(&mr->mr_sg[i], + virt_to_page(tail->iov_base), + tail->iov_len, + offset_in_page(tail->iov_base)); + cur->xc_flags |= XC_TAIL_DONE; + i++; + } else if (!(cur->xc_flags & XC_TAIL_DONE) && + !xdrbuf->tail[0].iov_len) { + cur->xc_flags |= XC_TAIL_DONE; + } + +finish: mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; @@ -338,15 +407,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - return seg; + return 0; out_dmamap_err: trace_xprtrdma_frwr_sgerr(mr, i); - return ERR_PTR(-EIO); + return -EIO; out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - return ERR_PTR(-EIO); + return -EIO; } /** diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c 
b/net/sunrpc/xprtrdma/rpc_rdma.c index 3aac1456e23e..a77e7e48aab2 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -200,67 +200,30 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) return 0; } -/* Convert @vec to a single SGL element. - * - * Returns pointer to next available SGE, and bumps the total number - * of SGEs consumed. - */ -static struct rpcrdma_mr_seg * -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - unsigned int *n) +static void +rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur, + const struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type) { - seg->mr_page = virt_to_page(vec->iov_base); - seg->mr_offset = offset_in_page(vec->iov_base); - seg->mr_len = vec->iov_len; - ++seg; - ++(*n); - return seg; + cur->xc_buf = xdrbuf; + cur->xc_page_offset = 0; + cur->xc_flags = 0; + + if (pos != 0) + cur->xc_flags |= XC_HEAD_DONE; + if (!xdrbuf->page_len) + cur->xc_flags |= XC_PAGES_DONE; + if (type == rpcrdma_readch || type == rpcrdma_writech || + !xdrbuf->tail[0].iov_len) + cur->xc_flags |= XC_TAIL_DONE; } -/* Convert @xdrbuf into SGEs no larger than a page each. As they - * are registered, these SGEs are then coalesced into RDMA segments - * when the selected memreg mode supports it. - * - * Returns positive number of SGEs consumed, or a negative errno. 
- */ - -static int -rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, - unsigned int pos, enum rpcrdma_chunktype type, - struct rpcrdma_mr_seg *seg) +static bool +rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur) { - unsigned long page_base; - unsigned int len, n; - struct page **ppages; - - n = 0; - if (pos == 0) - seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); - - len = xdrbuf->page_len; - ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); - page_base = offset_in_page(xdrbuf->page_base); - while (len) { - seg->mr_page = *ppages; - seg->mr_offset = page_base; - seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); - len -= seg->mr_len; - ++ppages; - ++seg; - ++n; - page_base = 0; - } - - if (type == rpcrdma_readch || type == rpcrdma_writech) - goto out; - - if (xdrbuf->tail[0].iov_len) - rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); - -out: - if (unlikely(n > RPCRDMA_MAX_SEGS)) - return -EIO; - return n; + return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE | + XC_TAIL_DONE)) == + (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE); } static int @@ -292,11 +255,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } -static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, - struct rpcrdma_mr **mr) +static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_xdr_cursor *cur, + bool writing, struct rpcrdma_mr **mr) { *mr = rpcrdma_mr_pop(&req->rl_free_mrs); if (!*mr) { @@ -307,13 +269,13 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, } rpcrdma_mr_push(*mr, &req->rl_registered); - return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: trace_xprtrdma_nomrs_err(r_xprt, req); 
xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* Register and XDR encode the Read list. Supports encoding a list of read @@ -336,10 +298,10 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; unsigned int pos; - int nsegs; + int ret; if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; @@ -347,24 +309,20 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, - rtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype); do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr); + if (ret) + return ret; if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.read_chunk_count++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); done: if (xdr_stream_encode_item_absent(xdr) < 0) @@ -394,20 +352,16 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_writech) goto done; - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, - rqst->rq_rcv_buf.head[0].iov_len, - wtype, 
seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, wtype); if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -418,30 +372,30 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_write(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) return -EMSGSIZE; trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, - nsegs); + true); r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += mr->mr_length; + r_xprt->rx_stats.total_rdma_request += + ep->re_write_pad_mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; } /* Update count of segments in this Write chunk */ @@ -471,9 +425,9 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; - struct rpcrdma_mr_seg *seg; + struct rpcrdma_xdr_cursor cur; struct rpcrdma_mr *mr; - int nsegs, nchunks; + int nchunks, ret; __be32 *segcount; if (wtype != rpcrdma_replych) { @@ -482,10 +436,7 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return 0; } - seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); - if (nsegs < 0) - return nsegs; + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype); if 
(xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; @@ -496,19 +447,19 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); - if (IS_ERR(seg)) - return PTR_ERR(seg); + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); + if (ret) + return ret; if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, + rpcrdma_xdr_cursor_done(&cur)); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; - nsegs -= mr->mr_nents; - } while (nsegs); + } while (!rpcrdma_xdr_cursor_done(&cur)); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8147d2b41494..37bba72065e8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,19 +283,36 @@ struct rpcrdma_mr { * registered or invalidated. Must handle a Reply chunk: */ enum { - RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_IOV_SEGS = 3, /* head, page-boundary, tail */ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + RPCRDMA_MAX_IOV_SEGS, }; -/* Arguments for DMA mapping and registration */ -struct rpcrdma_mr_seg { - u32 mr_len; /* length of segment */ - struct page *mr_page; /* underlying struct page */ - u64 mr_offset; /* IN: page offset, OUT: iova */ +/** + * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf + * for iterative MR registration + * @xc_buf: the xdr_buf being iterated + * @xc_page_offset: byte offset into the page region consumed so far + * @xc_flags: combination of XC_* bits + * + * Each XC_*_DONE flag indicates that this region has no + * remaining MR registration work. 
That condition holds both when the region + * has already been registered by a prior frwr_map() call and + * when the region is excluded from this chunk type (pre-set + * at init time by rpcrdma_xdr_cursor_init()). frwr_map() + * treats the two cases identically: skip the region. + */ +struct rpcrdma_xdr_cursor { + const struct xdr_buf *xc_buf; + unsigned int xc_page_offset; + unsigned int xc_flags; }; +#define XC_HEAD_DONE BIT(0) +#define XC_PAGES_DONE BIT(1) +#define XC_TAIL_DONE BIT(2) + /* The Send SGE array is provisioned to send a maximum size * inline request: * - RPC-over-RDMA header @@ -330,7 +347,6 @@ struct rpcrdma_req { struct list_head rl_free_mrs; struct list_head rl_registered; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -450,8 +466,8 @@ rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) } /* Setting this to 0 ensures interoperability with early servers. - * Setting this to 1 enhances certain unaligned read/write performance. - * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ + * Setting this to 1 enhances unaligned read/write performance. 
+ * Default is 0, see sysctl entry and rpc_rdma.c */ extern int xprt_rdma_pad_optimize; /* This setting controls the hunt for a supported memory @@ -535,10 +551,10 @@ void frwr_reset(struct rpcrdma_req *req); int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_mr_release(struct rpcrdma_mr *mr); -struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr *mr); +int frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_xdr_cursor *cur, + bool writing, __be32 xid, + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); From 93b4791adb1017b2b079b4a453e7159e101a7e55 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 6 Mar 2026 16:56:27 -0500 Subject: [PATCH 10/21] xprtrdma: Scale receive batch size with credit window The fixed RPCRDMA_MAX_RECV_BATCH of 7 results in frequent small ib_post_recv batches during high-rate workloads. With a 128-slot credit window, receives are reposted every 7th completion, each batch incurring atomic serialization and a doorbell write. Replace the fixed batch constant with a per-endpoint value scaled to 25% of the negotiated credit window. For a typical 128-credit connection this raises the batch from 7 to 32, reducing doorbell frequency by roughly 4x and amortizing the per-batch atomic and MMIO costs over a larger group of receive WRs. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 229057d35fb8..7f79a0a2601e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -244,9 +244,10 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) } ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_recv_batch = ep->re_max_requests >> 2; ep->re_attr.cap.max_recv_wr = ep->re_max_requests; ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; + ep->re_attr.cap.max_recv_wr += ep->re_recv_batch; ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ ep->re_max_rdma_segs = diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 90fd83f2d846..aecf9c0a153f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1374,7 +1374,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; - needed += RPCRDMA_MAX_RECV_BATCH; + needed += ep->re_recv_batch; if (atomic_inc_return(&ep->re_receiving) > 1) goto out_dec; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 37bba72065e8..f53a77472724 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -96,6 +96,7 @@ struct rpcrdma_ep { struct rpcrdma_notification re_rn; int re_receive_count; unsigned int re_max_requests; /* depends on device */ + unsigned int re_recv_batch; unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ From 704f3f640f72db4d44ec5ce3db8d4e150c974bc7 Mon Sep 17 00:00:00 2001 From: Chuck Lever 
Date: Fri, 6 Mar 2026 16:56:28 -0500 Subject: [PATCH 11/21] xprtrdma: Post receive buffers after RPC completion rpcrdma_post_recvs() runs in CQ poll context and its cost falls on the latency-critical path between polling a Receive completion and waking the RPC consumer. Every cycle spent refilling the Receive Queue delays delivery of the reply to the NFS layer. Move the rpcrdma_post_recvs() call in rpcrdma_reply_handler() to after the RPC has been decoded and completed. The larger batch size from the preceding patch provides sufficient Receive Queue headroom to absorb the brief delay before buffers are replenished. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a77e7e48aab2..0e0f21974710 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1422,7 +1422,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); @@ -1441,15 +1440,20 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* LocalInv completion will complete the RPC */ else kref_put(&req->rl_kref, rpcrdma_reply_done); - return; -out_badversion: - trace_xprtrdma_reply_vers_err(rep); - goto out; +out_post: + rpcrdma_post_recvs(r_xprt, + credits + (buf->rb_bc_srv_max_requests << 1)); + return; out_norqst: spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst_err(rep); + rpcrdma_rep_put(buf, rep); + goto out_post; + +out_badversion: + trace_xprtrdma_reply_vers_err(rep); goto out; out_shortreply: From b0ed12538fdfeb39c844eba3fa4c269ddb4ebca7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2026 
08:03:06 +0100 Subject: [PATCH 12/21] NFS/blocklayout: print each device used for SCSI layouts We already print device uses for block layouts, do the same for SCSI layouts as that greatly helps understanding the operation of the client. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index cc6327d97a91..bb35f88501ce 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -370,11 +370,14 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, - NULL, NULL); + bdev_file = bdev_file_open_by_path(devname, + BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(bdev_file)) { dprintk("failed to open device %s (%ld)\n", devname, PTR_ERR(bdev_file)); + } else { + pr_info("pNFS: using block device %s\n", + file_bdev(bdev_file)->bd_disk->disk_name); } kfree(devname); From 94545ffc0ae8ae6ab6590e9d7fed4da8123060cb Mon Sep 17 00:00:00 2001 From: Jenny Guanni Qu Date: Fri, 13 Mar 2026 22:42:07 +0000 Subject: [PATCH 13/21] pnfs/flexfiles: validate ds_versions_cnt is non-zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nfs4_ff_alloc_deviceid_node() reads version_count from XDR without checking it is non-zero. When a malicious NFS server sends a pNFS LAYOUTGET response with version_count=0, kcalloc(0, ...) returns ZERO_SIZE_PTR (0x10). The subsequent ds_versions[0] access in nfs4_ff_layout_ds_version() and other callers dereferences this invalid pointer, causing an out-of-bounds read. Add a check for version_count == 0 after parsing it from XDR, before the allocation. The OOB read was confirmed with KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] from accessing ZERO_SIZE_PTR. 
Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Reported-by: Klaudia Kloc Reported-by: Dawid Moczadło Tested-by: Jenny Guanni Qu Signed-off-by: Jenny Guanni Qu Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index c40395ae0814..1109462a9699 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -97,6 +97,11 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, if (unlikely(!p)) goto out_err_drain_dsaddrs; version_count = be32_to_cpup(p); + + if (version_count == 0) { + ret = -EINVAL; + goto out_err_drain_dsaddrs; + } dprintk("%s: version count %d\n", __func__, version_count); ds_versions = kzalloc_objs(struct nfs4_ff_ds_version, version_count, From 4fa7ab8d292b1d4271fad397d98ea440e474cd7f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 2 Apr 2026 19:12:36 -0400 Subject: [PATCH 14/21] NFS: fix RENAME attr in presence of directory delegations Since commit 6f9bda2337f8 ("NFS: Fix directory delegation verifier checks") xfstest generic/309 is failing because after the rename (mv) operation, client's mtime/ctime is the same. Update the delegated mtime when directory delegations are present in rename. 
Fixes: 6f9bda2337f8 ("NFS: Fix directory delegation verifier checks") Signed-off-by: Olga Kornievskaia Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- fs/nfs/nfs4proc.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3a5bba7e3c92..43a0543364b8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -692,7 +692,8 @@ void nfs_update_delegated_atime(struct inode *inode) void nfs_update_delegated_mtime_locked(struct inode *inode) { - if (nfs_have_delegated_mtime(inode)) + if (nfs_have_delegated_mtime(inode) || + nfs_have_directory_delegation(inode)) nfs_update_mtime(inode); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 768de9935ff1..dd800403a7ce 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5052,6 +5052,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, res->new_fattr->time_start, NFS_INO_INVALID_NLINK | NFS_INO_INVALID_DATA); + nfs_update_delegated_mtime(new_dir); } else nfs4_update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start, From 515af10044f1c0d6f4356fcfb313465f02f484e9 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 10 Apr 2026 12:48:05 -0400 Subject: [PATCH 15/21] NFSv4: retry GETATTR if GET_DIR_DELEGATION failed Currently, getting a directory delegation is opportunistic and gets added to an existing GETATTR that's trying to retrieve some needed attributes. However, GET_DIR_DELEGATION can fail and that currently causes a GETATTR to fail and an error is propagated to the user. Instead, the original GETATTR should be retried without requesting a directory delegation. Also, now choosing to clear the request for a directory delegation for this specific inode. 
Fixes: 156b09482933 ("NFS: Request a directory delegation on ACCESS, CREATE, and UNLINK") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dd800403a7ce..c2078545242e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4469,6 +4469,13 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, case -ENOTSUPP: case -EOPNOTSUPP: server->caps &= ~NFS_CAP_DIR_DELEG; + break; + case -NFS4ERR_INVAL: + case -NFS4ERR_IO: + case -NFS4ERR_DIRDELEG_UNAVAIL: + case -NFS4ERR_NOTDIR: + clear_bit(NFS_INO_REQ_DIR_DELEG, &(NFS_I(inode)->flags)); + status = -EAGAIN; } } @@ -4490,6 +4497,7 @@ int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, default: err = nfs4_handle_exception(server, err, &exception); break; + case -EAGAIN: case -ENOTSUPP: case -EOPNOTSUPP: exception.retry = true; From 8c787b286f39c7584440b97b92f87cbe934c13ff Mon Sep 17 00:00:00 2001 From: Tushar Sariya Date: Sat, 4 Apr 2026 11:58:03 -0230 Subject: [PATCH 16/21] NFSv4.1: Apply session size limits on clone path nfs4_clone_server() builds a child nfs_server for same-server automounted submounts but never calls nfs4_session_limit_rwsize() or nfs4_session_limit_xasize() after nfs_clone_server(). This means the child mount can end up with rsize/wsize values that exceed the negotiated session channel limits, causing NFS4ERR_REQ_TOO_BIG and EIO on servers that enforce tight max_request_size budgets. Top-level mounts go through nfs4_server_common_setup() which calls these limiters after nfs_probe_server(). Apply the same clamping on the clone path for consistency. 
Fixes: 2b092175f5e3 ("NFS: Fix inheritance of the block sizes when automounting") Cc: stable@vger.kernel.org Signed-off-by: Tushar Sariya Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 ++ fs/nfs/nfs4client.c | 4 ++-- fs/nfs/nfs4proc.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 63e09dfc27a8..0338603e9674 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -253,6 +253,8 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); +extern void nfs4_session_limit_rwsize(struct nfs_server *server); +extern void nfs4_session_limit_xasize(struct nfs_server *server); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index c211639949c2..71c271a1700a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -855,7 +855,7 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client); * Limit the mount rsize, wsize and dtsize using negotiated fore * channel attributes. */ -static void nfs4_session_limit_rwsize(struct nfs_server *server) +void nfs4_session_limit_rwsize(struct nfs_server *server) { struct nfs4_session *sess; u32 server_resp_sz; @@ -878,7 +878,7 @@ static void nfs4_session_limit_rwsize(struct nfs_server *server) /* * Limit xattr sizes using the channel attributes. 
*/ -static void nfs4_session_limit_xasize(struct nfs_server *server) +void nfs4_session_limit_xasize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_2 struct nfs4_session *sess; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c2078545242e..7225b4cfa6c2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10637,6 +10637,9 @@ static struct nfs_server *nfs4_clone_server(struct nfs_server *source, if (IS_ERR(server)) return server; + nfs4_session_limit_rwsize(server); + nfs4_session_limit_xasize(server); + error = nfs4_delegation_hash_alloc(server); if (error) { nfs_free_server(server); From 43ea7036ee50b5368b1c361e8a3591aa0f1455d9 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 5 Apr 2026 12:32:14 +0200 Subject: [PATCH 17/21] nfs: use memcpy_and_pad in decode_fh Use memcpy_and_pad() instead of memcpy() followed by memset() to simplify decode_fh(). Signed-off-by: Thorsten Blum Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 176873f45677..4382baddc9ee 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -96,8 +96,7 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, fh->size); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(&fh->data[0], p, fh->size); - memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); + memcpy_and_pad(fh->data, sizeof(fh->data), p, fh->size, 0); return 0; } From 5d3869a41f3608101c00ff9c9c7c2364c555fa65 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Apr 2026 18:24:23 -0400 Subject: [PATCH 18/21] NFS: fix writeback in presence of errors After running xfstest generic/751, in certain conditions, can have a writeback IO stuck while experiencing one of the two patterns. Pattern#1: writeback IO experiences ENOSPC on an offset smaller than the filesize. 
Example, write offset=0 len=4096 how=unstable OK write offset=8192 len=4096 how=unstable OK write offset=12288 len=4096 how=unstable ENOSPC write offset=4096 len=4096 how=unstable ENOSPC client sends a commit and receives a verifier which is different from the last successful write. It marks pages dirty and writeback retries. But it again sends writes unstable and gets into the same pattern, running into the ENOSPC error and sending a commit because writes were sent at unstable. Pattern#2: an unstable write followed by a short write and ENOSPC. write offset=0 len=4096 how=unstable OK write offset=4096 len=4096 how=unstable returns OK but count=100 write offset=4197 len=3996 how=stable returns ENOSPC client sends a commit and receives a verifier different from the last unstable write. The same behaviour is retried in a loop. Instead, this patch proposes to identify those conditions and mark requests to be done synchronously. A previous solution tried to mark it in the nfs_page, however that's not persistent thus instead mark it in the nfs_open_context. Furthermore, the same problem occurs during the localio code path so recognize that IO needs to be done sync in that case as well. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 15 ++++++++++++++- fs/nfs/pagelist.c | 3 +++ fs/nfs/write.c | 9 +++++++++ include/linux/nfs_fs.h | 1 + 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4c7d16a99ed6..e55c5977fcc3 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -865,6 +865,8 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { + size_t icount; + if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; /* Only use AIO completion if DIO-aligned segment is last */ @@ -881,8 +883,16 @@ static void nfs_local_call_write(struct work_struct *work) if (status == -EIOCBQUEUED) continue; /* Break on completion, errors, or short writes */ + icount = iov_iter_count(&iocb->iters[i]); if (nfs_local_pgio_done(iocb, status) || status < 0 || - (size_t)status < iov_iter_count(&iocb->iters[i])) { + (size_t)status < icount) { + if ((size_t)status < icount) { + struct nfs_lock_context *ctx = + iocb->hdr->req->wb_lock_context; + + set_bit(NFS_CONTEXT_WRITE_SYNC, + &ctx->open_context->flags); + } nfs_local_write_iocb_done(iocb); break; } @@ -901,6 +911,9 @@ static void nfs_local_do_write(struct nfs_local_kiocb *iocb, __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? 
"unstable" : "stable"); + if (test_bit(NFS_CONTEXT_WRITE_SYNC, + &hdr->req->wb_lock_context->open_context->flags)) + hdr->args.stable = NFS_FILE_SYNC; switch (hdr->args.stable) { default: break; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a9373de891c9..4a87b2fdb2e6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1186,6 +1186,9 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_lock(req); + if (test_bit(NFS_CONTEXT_WRITE_SYNC, + &req->wb_lock_context->open_context->flags)) + desc->pg_ioflags |= FLUSH_STABLE; subreq = req; subreq_size = subreq->wb_bytes; for(;;) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f1f62787dd74..f224b73fa30e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -927,9 +927,13 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto remove_req; } if (nfs_write_need_commit(hdr)) { + struct nfs_open_context *ctx = + hdr->req->wb_lock_context->open_context; + /* Reset wb_nio, since the write was successful. */ req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); + clear_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); goto next; @@ -1553,7 +1557,10 @@ static void nfs_writeback_result(struct rpc_task *task, if (resp->count < argp->count && !list_empty(&hdr->pages)) { static unsigned long complain; + struct nfs_open_context *ctx = + hdr->req->wb_lock_context->open_context; + set_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); /* This a short write! */ nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE); @@ -1837,6 +1844,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. 
Write the page again */ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_WRITE_SYNC, + &req->wb_lock_context->open_context->flags); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: nfs_unlock_and_release_request(req); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8dd79a3f3d66..4623262da3c0 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -109,6 +109,7 @@ struct nfs_open_context { #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) +#define NFS_CONTEXT_WRITE_SYNC (5) struct nfs4_threshold *mdsthreshold; struct list_head list; From 6e7daa3dad299080a9429522a98ac1ae1116ecc3 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 17 Apr 2026 16:35:43 -0400 Subject: [PATCH 19/21] NFSv4.2: fix CLONE/COPY attrs in presence of delegated attributes xfstest generic/407 is failing in 2 ways. It detects that after doing a clone the client does not update its mtime and its ctime. CLONE always sends a GETATTR operation and then calls nfs_post_op_update_inode() based on the returned attributes. Because of the delegated attributes the client ignores updating the mtime. Then also, when delegated attributes are present, for the change_attr the server replies with the same values as what the client cached before and thus generic/407 would flag that. Instead, make sure we invalidate the blocks attr. By updating the delegated attributes in nfs42_copy_dest_done(), both COPY and CLONE would update mtime appropriately. 
Fixes: e12912d94137 ("NFSv4: Add support for delegated atime and mtime attributes") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 7e5c1172fc11..7602ede6f75f 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -401,6 +401,7 @@ static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, NFS_INO_INVALID_MTIME | NFS_INO_INVALID_BLOCKS); spin_unlock(&inode->i_lock); + nfs_update_delegated_mtime(inode); } static ssize_t _nfs42_proc_copy(struct file *src, From e8a44ae87b553b0851a20bebf3d2634a45c5e316 Mon Sep 17 00:00:00 2001 From: Sean Chang Date: Mon, 20 Apr 2026 00:31:37 +0800 Subject: [PATCH 20/21] NFS: remove redundant __private attribute from nfs_page_class The nfs_page_class tracepoint uses a pointer for the 'req' field marked with the __private attribute. This causes Sparse to complain about dereferencing a private pointer within the trace ring buffer context, specifically during the TP_fast_assign() operation. This fixes a Sparse warning introduced in commit b6ef079fd984 ("nfs: more in-depth tracing of writepage events") by removing the redundant __private attribute from the 'req' field. 
Reviewed-by: Benjamin Coddington Signed-off-by: Sean Chang Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 9f9ce4a565ea..ff467959f733 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1496,7 +1496,7 @@ DECLARE_EVENT_CLASS(nfs_page_class, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __field(const struct nfs_page *__private, req) + __field(const struct nfs_page *, req) __field(loff_t, offset) __field(unsigned int, count) __field(unsigned long, flags) From e6614b88d59d110ee1a80ed0826e34f24dd35c96 Mon Sep 17 00:00:00 2001 From: Sean Chang Date: Mon, 20 Apr 2026 00:31:38 +0800 Subject: [PATCH 21/21] NFS: Fix RCU dereference of cl_xprt in nfs_compare_super_address The cl_xprt pointer in struct rpc_clnt is marked as __rcu. Accessing it directly in nfs_compare_super_address() is unsafe and triggers Sparse warnings. Fix this by using rcu_dereference() within an RCU read-side critical section to retrieve the transport pointer. This addresses the sparse warning and ensures atomic access to the pointer, as the transport can be updated via transport switching even while the superblock remains active under sb_lock. 
Fixes: 7e3fcf61abde ("nfs: don't share mounts between network namespaces") Signed-off-by: Sean Chang Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7a318581f85b..4cd420b14ce3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1166,12 +1166,18 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc) static int nfs_compare_super_address(struct nfs_server *server1, struct nfs_server *server2) { + struct rpc_xprt *xprt1, *xprt2; struct sockaddr *sap1, *sap2; - struct rpc_xprt *xprt1 = server1->client->cl_xprt; - struct rpc_xprt *xprt2 = server2->client->cl_xprt; + + rcu_read_lock(); + + xprt1 = rcu_dereference(server1->client->cl_xprt); + xprt2 = rcu_dereference(server2->client->cl_xprt); if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) - return 0; + goto out_unlock; + + rcu_read_unlock(); sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; @@ -1203,6 +1209,10 @@ static int nfs_compare_super_address(struct nfs_server *server1, } return 1; + +out_unlock: + rcu_read_unlock(); + return 0; } static int nfs_compare_userns(const struct nfs_server *old,