From 76b48a70b16b4036814964b039cde413e0164416 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 6 Feb 2026 00:08:36 -0500 Subject: [PATCH 01/24] IB/hfi1: Fix potential use-after-free in PIO and SDMA map teardown The current teardown logic for dd->pio_map and dd->sdma_map frees the structures while they might still be accessed by RCU readers. Although the pointer is nulled under a spinlock, the memory is reclaimed before waiting for the grace period to end. This patch fixes the sequence by: 1. Extracting the pointer under the lock. 2. Clearing the RCU-protected pointer. 3. Waiting for readers to finish with synchronize_rcu(). 4. Finally freeing the memory. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Link: https://patch.msgid.link/r/20260206050836.5890-1-lirongqing@baidu.com Signed-off-by: Li RongQing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 5 ++++- drivers/infiniband/hw/hfi1/sdma.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 51afaac88c72..9121d83bf88a 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1942,13 +1942,16 @@ int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) void free_pio_map(struct hfi1_devdata *dd) { + struct pio_vl_map *map; + /* Free PIO map if allocated */ if (rcu_access_pointer(dd->pio_map)) { spin_lock_irq(&dd->pio_map_lock); - pio_map_free(rcu_access_pointer(dd->pio_map)); + map = rcu_access_pointer(dd->pio_map); RCU_INIT_POINTER(dd->pio_map, NULL); spin_unlock_irq(&dd->pio_map_lock); synchronize_rcu(); + pio_map_free(map); } kfree(dd->kernel_send_context); dd->kernel_send_context = NULL; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index e5f442938177..cfd9dd0f7e81 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1255,6 +1255,7 @@ void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) { size_t i; struct sdma_engine *sde; + struct sdma_vl_map *map; if (dd->sdma_pad_dma) { dma_free_coherent(&dd->pcidev->dev, SDMA_PAD, @@ -1291,10 +1292,11 @@ void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) } if (rcu_access_pointer(dd->sdma_map)) { spin_lock_irq(&dd->sde_map_lock); - sdma_map_free(rcu_access_pointer(dd->sdma_map)); + map = rcu_access_pointer(dd->sdma_map); RCU_INIT_POINTER(dd->sdma_map, NULL); spin_unlock_irq(&dd->sde_map_lock); synchronize_rcu(); + sdma_map_free(map); } kfree(dd->per_sdma); dd->per_sdma = NULL; From 4c6f86d85d03cdb33addce86aa69aa795ca6c47a Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 14 Apr 2026 07:15:55 -0400 Subject: [PATCH 02/24] RDMA/rxe: Reject unknown opcodes before ICRC processing Even after applying commit 7244491dab34 ("RDMA/rxe: Validate pad and ICRC before payload_size() in rxe_rcv"), a single unauthenticated UDP packet can still trigger panic. That patch handled payload_size() underflow only for valid opcodes with short packets, not for packets carrying an unknown opcode. The unknown-opcode OOB read described below predates that commit and reaches back to the initial Soft RoCE driver. The check added there reads pkt->paylen < header_size(pkt) + bth_pad(pkt) + RXE_ICRC_SIZE where header_size(pkt) expands to rxe_opcode[pkt->opcode].length. The rxe_opcode[] array has 256 entries but is only populated for defined IB opcodes; any other entry (for example opcode 0xff) is zero-initialized, so length == 0 and the check degenerates to pkt->paylen < 0 + bth_pad(pkt) + RXE_ICRC_SIZE which does not constrain pkt->paylen enough. rxe_icrc_hdr() then computes rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES which underflows when length == 0 and passes a huge value to rxe_crc32(), causing an out-of-bounds read of the skb payload. Reproduced on v7.0-rc7 with that fix applied, QEMU/KVM with CONFIG_RDMA_RXE=y and CONFIG_KASAN=y, after rdma link add rxe0 type rxe netdev eth0 A single 48-byte UDP packet to port 4791 with BTH opcode=0xff and QPN=IB_MULTICAST_QPN triggers: BUG: KASAN: slab-out-of-bounds in crc32_le+0x115/0x170 Read of size 1 at addr ... The buggy address is located 0 bytes to the right of allocated 704-byte region Call Trace: crc32_le+0x115/0x170 rxe_icrc_hdr.isra.0+0x226/0x300 rxe_icrc_check+0x13f/0x3a0 rxe_rcv+0x6e1/0x16e0 rxe_udp_encap_recv+0x20a/0x320 udp_queue_rcv_one_skb+0x7ed/0x12c0 Subsequent packets with the same shape fault on unmapped memory and panic the kernel. The trigger requires only module load and "rdma link add"; no QP, no connection, and no authentication. Fix this by rejecting packets whose opcode has no rxe_opcode[] entry, detected via the zero mask or zero length, before any length arithmetic runs. Cc: stable@vger.kernel.org Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://patch.msgid.link/r/20260414111555.3386793-1-michael.bommarito@gmail.com Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Michael Bommarito Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_recv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index f79214738c2b..2d5e701ff961 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -330,6 +330,17 @@ void rxe_rcv(struct sk_buff *skb) pkt->qp = NULL; pkt->mask |= rxe_opcode[pkt->opcode].mask; + /* + * Unknown opcodes have a zero-initialized rxe_opcode[] entry, so + * both mask and length are 0. Reject them before any length math: + * rxe_icrc_hdr() would otherwise compute length - RXE_BTH_BYTES + * and pass the underflowed value to rxe_crc32(), producing an + * out-of-bounds read. + */ + if (unlikely(!rxe_opcode[pkt->opcode].mask || + !rxe_opcode[pkt->opcode].length)) + goto drop; + if (unlikely(pkt->paylen < header_size(pkt) + bth_pad(pkt) + RXE_ICRC_SIZE)) goto drop; From 1114c87aa6f195cf07da55a27b2122ae26557b26 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 18 Apr 2026 12:21:41 -0400 Subject: [PATCH 03/24] RDMA/rxe: Reject non-8-byte ATOMIC_WRITE payloads atomic_write_reply() at drivers/infiniband/sw/rxe/rxe_resp.c unconditionally dereferences 8 bytes at payload_addr(pkt): value = *(u64 *)payload_addr(pkt); check_rkey() previously accepted an ATOMIC_WRITE request with pktlen == resid == 0 because the length validation only compared pktlen against resid. A remote initiator that sets the RETH length to 0 therefore reaches atomic_write_reply() with a zero-byte logical payload, and the responder reads sizeof(u64) bytes from past the logical end of the packet into skb->head tailroom, then writes those 8 bytes into the attacker's MR via rxe_mr_do_atomic_write(). That is a remote disclosure of 4 bytes of kernel tailroom per probe (the other 4 bytes are the packet's own trailing ICRC). IBA oA19-28 defines ATOMIC_WRITE as exactly 8 bytes. Anything else is protocol-invalid. Hoist a strict length check into check_rkey() so the responder never reaches the unchecked dereference, and keep the existing WRITE-family length logic for the normal RDMA WRITE path. Reproduced on mainline with an unmodified rxe driver: a sustained zero-length ATOMIC_WRITE probe repeatedly leaks adjacent skb head-buffer bytes into the attacker's MR, including recognisable kernel strings and partial kernel-direct-map pointer words. With this patch applied the responder rejects the PDU and the MR stays all-zero. Cc: stable@vger.kernel.org Fixes: 034e285f8b99 ("RDMA/rxe: Make responder support atomic write on RC service") Link: https://patch.msgid.link/r/20260418162141.3610201-1-michael.bommarito@gmail.com Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 9faf8c09aa8e..9cb2f6fbf2dd 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -540,7 +540,19 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } skip_check_range: - if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { + if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + /* IBA oA19-28: ATOMIC_WRITE payload is exactly 8 bytes. + * Reject any other length before the responder reads + * sizeof(u64) bytes from payload_addr(pkt); a shorter + * payload would read past the logical end of the packet + * into skb->head tailroom. + */ + if (resid != sizeof(u64) || pktlen != sizeof(u64) || + bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err; + } + } else if (pkt->mask & RXE_WRITE_MASK) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; From c488df06bd552bb8b6e14fa0cfd5ad986c6e9525 Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Fri, 24 Apr 2026 13:51:02 +0800 Subject: [PATCH 04/24] RDMA/mlx5: Fix error path fall-through in mlx5_ib_dev_res_srq_init() mlx5_ib_dev_res_srq_init() allocates two SRQs, s0 and s1. When ib_create_srq() fails for s1, the error branch destroys s0 but falls through and unconditionally assigns the freed s0 and the ERR_PTR s1 to devr->s0 and devr->s1. This leads to several problems: the lock-free fast path checks "if (devr->s1) return 0;" and treats the ERR_PTR as already initialised; users in mlx5_ib_create_qp() dereference the freed SRQ or ERR_PTR via to_msrq(devr->s0)->msrq.srqn; and mlx5_ib_dev_res_cleanup() dereferences the ERR_PTR and double-frees s0 on teardown. Fix by adding the same `goto unlock` in the s1 failure path. Cc: stable@vger.kernel.org Fixes: 5895e70f2e6e ("IB/mlx5: Allocate resources just before first QP/SRQ is created") Link: https://patch.msgid.link/r/SYBPR01MB7881E1E0970268BD69C0BA75AF2B2@SYBPR01MB7881.ausprd01.prod.outlook.com Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 109661c2ac12..8115ae869ef2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3392,6 +3392,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) "Couldn't create SRQ 1 for res init, err=%pe\n", s1); ib_destroy_srq(s0); + goto unlock; } devr->s0 = s0; From 38694f4639c45599161860e828dc4ac77abf8cea Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 27 Apr 2026 14:02:32 +0300 Subject: [PATCH 05/24] RDMA/mlx5: Fix UAF in SRQ destroy due to race with create A race condition exists between mlx5_cmd_destroy_srq() and mlx5_cmd_create_srq() that can lead to a use-after-free (UAF) [1]. After destroy_srq_split() releases the SRQ to firmware, the SRQN can be immediately reallocated for a new SRQ being created concurrently. If the create path stores the new SRQ in the xarray before the destroy path erases it, the destroy will incorrectly delete the new SRQ's entry. Later accesses then hit freed memory. Fix by replacing the unconditional xa_erase_irq() with xa_cmpxchg_irq() that only erases the entry if it hasn't already been replaced (still contains XA_ZERO_ENTRY), preserving any newly created SRQ. [1] RIP: 0010:mlx5_cmd_destroy_srq+0xd8/0x110 [mlx5_ib] Code: 89 e1 ba 06 04 00 00 4c 89 f6 48 89 ef e8 80 19 70 e1 c6 83 a0 0f 00 00 00 fb 5b 44 89 e8 5d 41 5c 41 5d 41 5e c3 cc cc cc cc <0f> 0b 48 89 c2 83 e2 03 48 83 fa 02 75 08 48 3d 05 c0 ff ff 77 08 RSP: 0018:ff110001037b7d08 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ff1100010bb9c000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff110001037b7c90 RBP: ff1100010bb9cfa0 R08: 0000000000000000 R09: 0000000000000000 R10: ff110001037b7da0 R11: ff11000104f29580 R12: ff1100010e2ac090 R13: 000000000000000d R14: 0000000000000001 R15: ff11000105336300 FS: 00007fa24787c740(0000) GS:ff1100046eb8d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa247984e90 CR3: 0000000109d59005 CR4: 0000000000373eb0 Call Trace: mlx5_ib_destroy_srq+0x25/0xa0 [mlx5_ib] ib_destroy_srq_user+0x21/0x90 [ib_core] uverbs_free_srq+0x1b/0x50 [ib_uverbs] destroy_hw_idr_uobject+0x1e/0x50 [ib_uverbs] uverbs_destroy_uobject+0x35/0x180 [ib_uverbs] __uverbs_cleanup_ufile+0xdd/0x140 [ib_uverbs] uverbs_destroy_ufile_hw+0x38/0xf0 [ib_uverbs] ib_uverbs_close+0x17/0xa0 [ib_uverbs] __fput+0xe0/0x2a0 __x64_sys_close+0x3a/0x80 do_syscall_64+0x55/0xac0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fa247984ea4 Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 80 3d a5 51 0e 00 00 74 13 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3c c3 0f 1f 00 55 48 89 e5 48 83 ec 10 89 7d RSP: 002b:00007ffecfa79498 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 0000200000000080 RCX: 00007fa247984ea4 RDX: 0000000000000040 RSI: 0000200000000200 RDI: 0000000000000003 RBP: 00007ffecfa794e0 R08: 00007ffecfa794e0 R09: 00007ffecfa794e0 R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 0000200000000000 R15: 0000200000000009 ---[ end trace 0000000000000000 ]--- Fixes: fd89099d635e ("RDMA/mlx5: Issue FW command to destroy SRQ on reentry") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-1-4621fa52de0e@nvidia.com Signed-off-by: Edward Srouji Reviewed-by: Michael Guralnik Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/srq_cmd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index 8b3385396599..c1a088120915 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -683,7 +683,14 @@ int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, srq, 0); return err; } - xa_erase_irq(&table->array, srq->srqn); + + /* + * A race can occur where a concurrent create gets the same srqn + * (after hardware released it) and overwrites XA_ZERO_ENTRY with + * its new SRQ before we reach here. In that case, we must not erase + * the entry as it now belongs to the new SRQ. + */ + xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, NULL, 0); mlx5_core_res_put(&srq->common); wait_for_completion(&srq->common.free); From 9bee81cc5e8811c8bbe67fbf5214a7998457324b Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 27 Apr 2026 14:02:33 +0300 Subject: [PATCH 06/24] RDMA/mlx5: Fix UAF in DCT destroy due to race with create A potential race condition exists between mlx5_core_destroy_dct() and mlx5_core_create_dct() that can lead to a use-after-free. After _mlx5_core_destroy_dct() releases the DCT to firmware, the DCTN can be immediately reallocated for a new DCT being created concurrently. If the create path stores the new DCT in the xarray before the destroy path erases it, the destroy will incorrectly delete the new DCT's entry. Later accesses then hit freed memory. Fix by replacing the unconditional xa_erase_irq() with xa_cmpxchg_irq() that only erases the entry if it hasn't already been replaced (still contains XA_ZERO_ENTRY), preserving any newly created DCT. Fixes: afff24899846 ("RDMA/mlx5: Handle DCT QP logic separately from low level QP interface") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-2-4621fa52de0e@nvidia.com Signed-off-by: Edward Srouji Reviewed-by: Michael Guralnik Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qpc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 146d03ae40bd..a7a4f9420271 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -314,7 +314,14 @@ int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, dct, 0); return err; } - xa_erase_irq(&table->dct_xa, dct->mqp.qpn); + + /* + * A race can occur where a concurrent create gets the same dctn + * (after hardware released it) and overwrites XA_ZERO_ENTRY with + * its new DCT before we reach here. In that case, we must not erase + * the entry as it now belongs to the new DCT. + */ + xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, NULL, 0); return 0; } From 610771c62e2ac5bca851fc5a6f8af1cdd83f189a Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 27 Apr 2026 14:02:34 +0300 Subject: [PATCH 07/24] IB/core: Fix IPv6 netlink message size in ib_nl_ip_send_msg() When resolving an RDMA-CM IPv6 address, ib_nl_ip_send_msg() sends a netlink request to the userspace daemon to perform IP-to-GID resolution in certain cases. The function allocates the netlink message buffer using nla_total_size(sizeof(size)), which passes 8 bytes (the size of size_t) instead of 16 bytes (the size of an IPv6 address). This results in an 8-byte under-allocation. This is currently masked by nlmsg_new() over-allocation of the skb in its internal logic. However, the code remains incorrect. Fix the issue by supplying the proper IPv6 address length to nla_total_size(). Fixes: ae43f8286730 ("IB/core: Add IP to GID netlink offload") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-3-4621fa52de0e@nvidia.com Signed-off-by: Maher Sanalla Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index a40a765f0307..27992c38ad90 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -149,7 +149,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; } - len = nla_total_size(sizeof(size)); + len = nla_total_size(size); len += NLMSG_ALIGN(sizeof(*header)); skb = nlmsg_new(len, GFP_KERNEL); From 1f3b337af2231b1e83c9052f771b201f5cbb9997 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 27 Apr 2026 14:02:35 +0300 Subject: [PATCH 08/24] RDMA/core: Fix rereg_mr use-after-free race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a driver creates a new MR during rereg_user_mr, a race window exists between rdma_alloc_commit_uobject() for the new MR and the point where the code reads that MR to populate the response keys. A concurrent rereg_mr or destroy_mr could destroy the MR in this window and cause UAF in the first thread. Racing flow between two rereg_mr calls: CPU0 CPU1 ---- ---- rereg_user_mr(mr_handle) uobj_get_write(mr_handle) -> mr0 mr1 = driver→rereg() rdma_alloc_commit_uobject(mr1) // mr1 replaced mr0 and is unlocked uobj_put_destroy(mr0) rereg_user_mr(mr_handle) uobj_get_write(mr_handle) -> mr1 mr2 = driver→rereg() rdma_alloc_commit_uobject(mr2) // mr2 replaced mr1 and is unlocked uobj_put_destroy(mr1) // Destroys mr1! resp.lkey = mr1->lkey; // UAF - mr1 was freed! resp.rkey = mr1->rkey; // UAF - mr1 was freed! Fix by storing lkey/rkey in local variables before the new MR is unlocked and using the local variables to set the user response. Fixes: 6e0954b11c05 ("RDMA/uverbs: Allow drivers to create a new HW object during rereg_mr") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-4-4621fa52de0e@nvidia.com Signed-off-by: Michael Guralnik Reviewed-by: Maher Sanalla Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a768436ba468..91a62d2ade4d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -778,6 +778,7 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) struct ib_pd *orig_pd; struct ib_pd *new_pd; struct ib_mr *new_mr; + u32 lkey, rkey; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) @@ -846,6 +847,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) new_mr->uobject = uobj; atomic_inc(&new_pd->usecnt); new_uobj->object = new_mr; + lkey = new_mr->lkey; + rkey = new_mr->rkey; rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR); rdma_restrack_set_name(&new_mr->res, NULL); @@ -871,11 +874,13 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) mr->iova = cmd.hca_va; mr->length = cmd.length; } + lkey = mr->lkey; + rkey = mr->rkey; } memset(&resp, 0, sizeof(resp)); - resp.lkey = mr->lkey; - resp.rkey = mr->rkey; + resp.lkey = lkey; + resp.rkey = rkey; ret = uverbs_response(attrs, &resp, sizeof(resp)); From 6009cca96fcb01182cede725ad61e9e3810f3932 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 27 Apr 2026 14:02:36 +0300 Subject: [PATCH 09/24] RDMA/mlx5: Fix null-ptr-deref in Raw Packet QP creation Raw Packet QPs are unique in that they support separate send and receive queues, using 2 different user-provided buffers. They can also be created with one of the queues having size 0, allowing a send-only or receive-only QP. The Raw Packet RQ umem is created in the common user QP creation path, which allows zero-length queues. Add a later validation of the RQ umem in Raw Packet QP creation path when an RQ was requested. This prevents possible null-ptr dereference crashes, as seen in the below trace: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 6 UID: 0 PID: 3539 Comm: raw_packet_umem Not tainted 6.19.0-rc1+ #166 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:__mlx5_umem_find_best_quantized_pgoff+0x37/0x280 [mlx5_ib] Code: ff df 41 57 49 89 ff 41 56 41 55 41 89 d5 41 54 4d 89 cc 4c 8d 4f 30 55 4c 89 ca 48 89 f5 53 48 c1 ea 03 48 89 cb 48 83 ec 18 <80> 3c 02 00 44 89 04 24 0f 85 01 02 00 00 48 ba 00 00 00 00 00 fc RSP: 0018:ff1100013966f4e0 EFLAGS: 00010282 RAX: dffffc0000000000 RBX: 00000000ffffffc0 RCX: 00000000ffffffc0 RDX: 0000000000000006 RSI: 00000ffffffff000 RDI: 0000000000000000 RBP: 00000ffffffff000 R08: 0000000000000040 R09: 0000000000000030 R10: 0000000000000000 R11: 0000000000000000 R12: ff1100013966f648 R13: 0000000000000005 R14: ff1100013966f980 R15: 0000000000000000 FS: 00007fae6c82f740(0000) GS:ff11000898ba1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000000 CR3: 000000010f96c005 CR4: 0000000000373eb0 Call Trace: create_qp+0x747d/0xc740 [mlx5_ib] ? is_module_address+0x18/0x110 ? _create_user_qp.constprop.0+0x18e0/0x18e0 [mlx5_ib] ? __module_address+0x49/0x210 ? is_module_address+0x68/0x110 ? static_obj+0x67/0x90 ? lockdep_init_map_type+0x58/0x200 mlx5_ib_create_qp+0xc85/0x2620 [mlx5_ib] ? find_held_lock+0x2b/0x80 ? create_qp+0xc740/0xc740 [mlx5_ib] ? lock_release+0xcb/0x260 ? lockdep_init_map_type+0x58/0x200 ? __init_swait_queue_head+0xcb/0x150 create_qp.part.0+0x558/0x7c0 [ib_core] ib_create_qp_user+0xa0/0x4f0 [ib_core] ? rdma_lookup_get_uobject+0x1e4/0x400 [ib_uverbs] create_qp+0xe4f/0x1d10 [ib_uverbs] ? ib_uverbs_rereg_mr+0xd40/0xd40 [ib_uverbs] ? ib_uverbs_cq_event_handler+0x120/0x120 [ib_uverbs] ? __might_fault+0x81/0x100 ? lock_release+0xcb/0x260 ? _copy_from_user+0x3e/0x90 ib_uverbs_create_qp+0x10a/0x150 [ib_uverbs] ? ib_uverbs_ex_create_qp+0xe0/0xe0 [ib_uverbs] ? __might_fault+0x81/0x100 ? lock_release+0xcb/0x260 ib_uverbs_write+0x7e5/0xc90 [ib_uverbs] ? uverbs_devnode+0xc0/0xc0 [ib_uverbs] ? lock_acquire+0xfa/0x2b0 ? find_held_lock+0x2b/0x80 ? finish_task_switch.isra.0+0x189/0x6c0 vfs_write+0x1c0/0xf70 ? lockdep_hardirqs_on_prepare+0xde/0x170 ? kernel_write+0x5a0/0x5a0 ? __switch_to+0x527/0xe60 ? __schedule+0x10a3/0x3950 ? io_schedule_timeout+0x110/0x110 ksys_write+0x170/0x1c0 ? __x64_sys_read+0xb0/0xb0 ? trace_hardirqs_off.part.0+0x4e/0xe0 do_syscall_64+0x70/0x1360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fae6ca3118d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 5b cc 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe678ca308 EFLAGS: 00000213 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ffe678ca448 RCX: 00007fae6ca3118d RDX: 0000000000000070 RSI: 0000200000000280 RDI: 0000000000000003 RBP: 00007ffe678ca320 R08: 00000000ffffffff R09: 00007fae6c8ec5b8 R10: 0000000000000064 R11: 0000000000000213 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fae6cb71000 R15: 0000000000404df0 Modules linked in: mlx5_ib mlx5_fwctl mlx5_core bonding ip6_gre ip6_tunnel tunnel6 ip_gre gre rdma_ucm ib_uverbs rdma_cm iw_cm ib_ipoib ib_cm ib_umad ib_core rpcsec_gss_krb5 auth_rpcgss oid_registry overlay nfnetlink zram zsmalloc fuse scsi_transport_iscsi [last unloaded: mlx5_core] ---[ end trace 0000000000000000 ]--- RIP: 0010:__mlx5_umem_find_best_quantized_pgoff+0x37/0x280 [mlx5_ib] Fixes: 0fb2ed66a14c ("IB/mlx5: Add create and destroy functionality for Raw Packet QP") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-5-4621fa52de0e@nvidia.com Signed-off-by: Michael Guralnik Reviewed-by: Maher Sanalla Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8f50e7342a76..1611a704c1b3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1603,6 +1603,11 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (qp->rq.wqe_cnt) { + if (!rq->base.ubuffer.umem) { + err = -EINVAL; + goto err_destroy_sq; + } + rq->base.container_mibqp = qp; if (qp->flags & IB_QP_CREATE_CVLAN_STRIPPING) From 70f780edcd1e86350202d8a409de026b2d2e2067 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:34 -0300 Subject: [PATCH 10/24] RDMA/ionic: Fix typo in format string Applying the corrupted patch by hand mangled the format string, put the s in the right place. Cc: stable@vger.kernel.org Fixes: 654a27f25530 ("RDMA/ionic: bound node_desc sysfs read with %.64s") Link: https://patch.msgid.link/r/1-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reported-by: Brad Spengler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ionic/ionic_ibdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c index 0382a64839d2..73a616ae3502 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.c +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -185,7 +185,7 @@ static ssize_t hca_type_show(struct device *device, struct ionic_ibdev *dev = rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); - return sysfs_emit(buf, "%s.64\n", dev->ibdev.node_desc); + return sysfs_emit(buf, "%.64s\n", dev->ibdev.node_desc); } static DEVICE_ATTR_RO(hca_type); From 641858d52f2372124d9312a407e2124915d846ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:35 -0300 Subject: [PATCH 11/24] RDMA/mlx5: Restore zero-init to mlx5_ib_modify_qp() ucmd Sashiko points out the check for inlen==0 got missed, the ={} was not redundant, put it back. Fixes: a9cd442a5347 ("RDMA: Remove redundant = {} for udata req structs") Link: https://patch.msgid.link/r/2-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1611a704c1b3..8fd05532c09c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4697,7 +4697,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_modify_qp_resp resp = {}; struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_ib_modify_qp ucmd; + struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; From 45e8ebc9ede73543c55d597bb53b6bbb7e8b7327 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:36 -0300 Subject: [PATCH 12/24] RDMA/mlx5: Add missing store/release for lock elision pattern mlx5 has a common pattern implementing a device-global singleton resource where it checks the resource pointer for !NULL and then skips obtaining the lock. This is not ordered properly as observing !NULL doesn't mean that all the data under that pointer is also visible on this CPU when the lock is not taken. Use a release/acquire pairing to explicitly manage this. Pointed out by sashiko, Codex found more cases. Fixes: 5895e70f2e6e ("IB/mlx5: Allocate resources just before first QP/SRQ is created") Fixes: 638420115cc4 ("IB/mlx5: Create UMR QP just before first reg_mr occurs") Link: https://sashiko.dev/#/patchset/SYBPR01MB7881E1E0970268BD69C0BA75AF2B2%40SYBPR01MB7881.ausprd01.prod.outlook.com Link: https://patch.msgid.link/r/3-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Assisted-by: Codex:GPT-5.5 Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++---- drivers/infiniband/hw/mlx5/umr.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8115ae869ef2..61078281953d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3310,7 +3310,7 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) * devr->c0 is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (devr->c0) + if (smp_load_acquire(&devr->c0)) return 0; mutex_lock(&devr->cq_lock); @@ -3336,7 +3336,7 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) } devr->p0 = pd; - devr->c0 = cq; + smp_store_release(&devr->c0, cq); unlock: mutex_unlock(&devr->cq_lock); @@ -3354,7 +3354,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) * devr->s1 is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (devr->s1) + if (smp_load_acquire(&devr->s1)) return 0; mutex_lock(&devr->srq_lock); @@ -3396,7 +3396,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) } devr->s0 = s0; - devr->s1 = s1; + smp_store_release(&devr->s1, s1); unlock: mutex_unlock(&devr->srq_lock); diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 29488fba21a0..f2139474be37 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -147,7 +147,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) * UMR qp is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (dev->umrc.qp) + if (smp_load_acquire(&dev->umrc.qp)) return 0; mutex_lock(&dev->umrc.init_lock); @@ -185,7 +185,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) sema_init(&dev->umrc.sem, MAX_UMR_WR); mutex_init(&dev->umrc.lock); dev->umrc.state = MLX5_UMR_STATE_ACTIVE; - dev->umrc.qp = qp; + smp_store_release(&dev->umrc.qp, qp); mutex_unlock(&dev->umrc.init_lock); return 0; From 6dd2d4ad9c8429523b1c220c5132bd551c006425 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:37 -0300 Subject: [PATCH 13/24] RDMA/mana: Validate rx_hash_key_len Sashiko points out that rx_hash_key_len comes from a uAPI structure and is blindly passed to memcpy, allowing the userspace to trash kernel memory. Bounds check it so the memcpy cannot overflow. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/4-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 645581359cee..f7bb0d1f0f80 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -21,6 +21,9 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, gc = mdev_to_gc(dev); + if (rx_hash_key_len > sizeof(req->hashkey)) + return -EINVAL; + req_buf_size = struct_size(req, indir_tab, MANA_INDIRECT_TABLE_DEF_SIZE); req = kzalloc(req_buf_size, GFP_KERNEL); if (!req) From 159f2efabc89d3f931d38f2d35876535d4abf0a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:38 -0300 Subject: [PATCH 14/24] RDMA/mana: Remove user triggerable WARN_ON() in mana_ib_create_qp_rss() Sashiko points out that the user can specify WQs sharing the same CQ as a part of the uAPI and this will trigger the WARN_ON() then go on to corrupt the kernel. Just reject it outright and fail the QP creation. Cc: stable@vger.kernel.org Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/5-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/cq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index f4cbe21763bf..2d682428ef20 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -137,8 +137,9 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) if (cq->queue.id >= gc->max_num_cqs) return -EINVAL; - /* Create CQ table entry */ - WARN_ON(gc->cq_table[cq->queue.id]); + /* Create CQ table entry, sharing a CQ between WQs is not supported */ + if (gc->cq_table[cq->queue.id]) + return -EINVAL; if (cq->queue.kmem) gdma_cq = cq->queue.kmem; else From 34ecf795692ee57c393109f4a24ccc313091e137 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:39 -0300 Subject: [PATCH 15/24] RDMA/mana: Fix mana_destroy_wq_obj() cleanup in mana_ib_create_qp_rss() Sashiko points out there are two bugs here in the error unwind flow, both related to how the WQ table is unwound. First there is a double i-- on the first failure path due to the while loop having a i--, remove it. Second if mana_ib_install_cq_cb() fails then mana_create_wq_obj() is not undone due to the above i--. Cc: stable@vger.kernel.org Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/6-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index f7bb0d1f0f80..8e1f052d0ec9 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -176,11 +176,8 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, &wq_spec, &cq_spec, &wq->rx_object); - if (ret) { - /* Do cleanup starting with index i-1 */ - i--; + if (ret) goto fail; - } /* The GDMA regions are now owned by the WQ object */ wq->queue.gdma_region = GDMA_INVALID_DMA_REGION; @@ -200,8 +197,10 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, /* Create CQ table entry */ ret = mana_ib_install_cq_cb(mdev, cq); - if (ret) + if (ret) { + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); goto fail; + } } resp.num_entries = i; From 6aaa978c6b6218cfac15fe1dab17c76fe229ce3f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:40 -0300 Subject: [PATCH 16/24] RDMA/mana: Fix error unwind in mana_ib_create_qp_rss() Sashiko points out that mana_ib_cfg_vport_steering() is leaked, the normal destroy path cleans it up. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/7-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 8e1f052d0ec9..0fbcf449c134 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -217,13 +217,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata create rss-qp, %d\n", ret); - goto fail; + goto err_disable_vport_rx; } kfree(mana_ind_table); return 0; +err_disable_vport_rx: + mana_disable_vport_rx(mpc); fail: while (i-- > 0) { ibwq = ind_tbl->ind_tbl[i]; From ea4e4b168a531c522d2850719816b6f583b1738b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:41 -0300 Subject: [PATCH 17/24] RDMA/ocrdma: Clarify the mm_head searching The intention of this code is to find matching entries exactly, the driver never creates phys_addr's with different lens so the current expression is not a bug, but it doesn't make sense and confuses review tooling. Search for exact match instead. Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/8-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c17e2a54dbca..463c9a5703fc 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -215,7 +215,7 @@ static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { - if (len != mm->key.len && phy_addr != mm->key.phy_addr) + if (len != mm->key.len || phy_addr != mm->key.phy_addr) continue; list_del(&mm->entry); @@ -233,7 +233,7 @@ static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry(mm, &uctx->mm_head, entry) { - if (len != mm->key.len && phy_addr != mm->key.phy_addr) + if (len != mm->key.len || phy_addr != mm->key.phy_addr) continue; found = true; From 34fbf48cf3b410d2a6e8c586fa952a36331ca5ba Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:42 -0300 Subject: [PATCH 18/24] RDMA/ocrdma: Don't NULL deref uctx on errors in ocrdma_copy_pd_uresp() Sashiko points out that pd->uctx isn't initialized until late in the function so all these error flow references are NULL and will crash. Use the uctx that isn't NULL. Cc: stable@vger.kernel.org Fixes: fe2caefcdf58 ("RDMA/ocrdma: Add driver for Emulex OneConnect IBoE RDMA adapter") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/9-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 463c9a5703fc..a88cc5d84af8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -620,9 +620,9 @@ static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, ucopy_err: if (pd->dpp_enabled) - ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); + ocrdma_del_mmap(uctx, dpp_page_addr, PAGE_SIZE); dpp_map_err: - ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); + ocrdma_del_mmap(uctx, db_page_addr, db_page_size); return status; } From e38e86995df27f1f854063dab1f0c6a513db3faf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:43 -0300 Subject: [PATCH 19/24] RDMA/vmw_pvrdma: Fix double free on pvrdma_alloc_ucontext() error path Sashiko points out that pvrdma_uar_free() is already called within pvrdma_dealloc_ucontext(), so calling it before triggers a double free. Cc: stable@vger.kernel.org Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/10-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index bcd43dc30e21..c7c2b41060e5 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -322,7 +322,7 @@ int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) uresp.qp_tab_size = vdev->dsr->caps.max_qp; ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) { - pvrdma_uar_free(vdev, &context->uar); + /* pvrdma_dealloc_ucontext() also frees the UAR */ pvrdma_dealloc_ucontext(&context->ibucontext); return -EFAULT; } From c54c7e4cb679c0aaa1cb489b9c3f2cd98e63a44c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:44 -0300 Subject: [PATCH 20/24] RDMA/mlx4: Fix resource leak on error in mlx4_ib_create_srq() Sashiko points out that mlx4_srq_alloc() was not undone during error unwind, add the missing call to mlx4_srq_free(). Cc: stable@vger.kernel.org Fixes: 225c7b1feef1 ("IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=8 Link: https://patch.msgid.link/r/11-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/srq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 5b23e5f8b84a..767840736d58 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -194,13 +194,15 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq, if (udata) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { err = -EFAULT; - goto err_wrid; + goto err_srq; } init_attr->attr.max_wr = srq->msrq.max - 1; return 0; +err_srq: + mlx4_srq_free(dev->dev, &srq->msrq); err_wrid: if (udata) mlx4_ib_db_unmap_user(ucontext, &srq->db); From c9341307ea16b9395c2e4c9c94d8499d91fe31d0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:45 -0300 Subject: [PATCH 21/24] RDMA/mlx4: Fix mis-use of RCU in mlx4_srq_event() Sashiko points out the radix_tree itself is RCU safe, but nothing ever frees the mlx4_srq struct with RCU, and it isn't even accessed within the RCU critical section. It also will crash if an event is delivered before the srq object is finished initializing. Use the spinlock since it isn't easy to make RCU work, use refcount_inc_not_zero() to protect against partially initialized objects, and order the refcount_set() to be after the srq is fully initialized. Cc: stable@vger.kernel.org Fixes: 30353bfc43a1 ("net/mlx4_core: Use RCU to perform radix tree lookup for SRQ") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=5 Link: https://patch.msgid.link/r/12-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx4/srq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index dd890f5d7b72..8711689120f3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -44,13 +44,14 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; struct mlx4_srq *srq; + unsigned long flags; - rcu_read_lock(); + spin_lock_irqsave(&srq_table->lock, flags); srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); - rcu_read_unlock(); - if (srq) - refcount_inc(&srq->refcount); - else { + if (!srq || !refcount_inc_not_zero(&srq->refcount)) + srq = NULL; + spin_unlock_irqrestore(&srq_table->lock, flags); + if (!srq) { mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); return; } @@ -203,8 +204,8 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, if (err) goto err_radix; - refcount_set(&srq->refcount, 1); init_completion(&srq->free); + refcount_set_release(&srq->refcount, 1); return 0; From 48973c6c938737bb900d15dc82b91dfe3586cb0f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:46 -0300 Subject: [PATCH 22/24] RDMA/hns: Fix xarray race in hns_roce_create_srq() Sashiko points out that once the srq memory is stored into the xarray by alloc_srqc() it can immediately be looked up by: xa_lock(&srq_table->xa); srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1)); if (srq) refcount_inc(&srq->refcount); xa_unlock(&srq_table->xa); Which will fail refcount debug because the refcount is 0 and then crash: srq->event(srq, event_type); Because event is NULL. Use refcount_inc_not_zero() instead to ensure a partially prepared srq is never retrieved from the event handler and fix the ordering of the initialization so refcount becomes 1 only after it is fully ready. All the initialization must be done before calling free_srqc() since it depends on the completion and refcount. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=3 Link: https://patch.msgid.link/r/13-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_srq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index cb848e8e6bbd..8b94cbdfa54d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -16,8 +16,8 @@ void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type) xa_lock(&srq_table->xa); srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1)); - if (srq) - refcount_inc(&srq->refcount); + if (srq && !refcount_inc_not_zero(&srq->refcount)) + srq = NULL; xa_unlock(&srq_table->xa); if (!srq) { @@ -470,6 +470,10 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, if (ret) goto err_srqn; + srq->event = hns_roce_ib_srq_event; + init_completion(&srq->free); + refcount_set_release(&srq->refcount, 1); + if (udata) { resp.cap_flags = srq->cap_flags; resp.srqn = srq->srqn; @@ -480,10 +484,6 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, } } - srq->event = hns_roce_ib_srq_event; - refcount_set(&srq->refcount, 1); - init_completion(&srq->free); - return 0; err_srqc: From 7d51783d82fea000a9ce96fa1dcf3e0a8cedc4fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:47 -0300 Subject: [PATCH 23/24] RDMA/hns: Fix xarray race in hns_roce_create_qp_common() Similar to the SRQ case the hr_qp is stored in the xarray before it is fully initialized. Unlike the SRQ case the error unwinds do not wait for the completion so keep the refcount 0 until the function succeeds. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Link: https://patch.msgid.link/r/14-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Suggested-by: Junxian Huang Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index a27ea85bb063..f94ba98871f0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -47,8 +47,8 @@ static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, xa_lock_irqsave(&hr_dev->qp_table_xa, flags); qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (qp) - refcount_inc(&qp->refcount); + if (qp && !refcount_inc_not_zero(&qp->refcount)) + qp = NULL; xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags); if (!qp) @@ -1251,8 +1251,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp->ibqp.qp_num = hr_qp->qpn; hr_qp->event = hns_roce_ib_qp_event; - refcount_set(&hr_qp->refcount, 1); init_completion(&hr_qp->free); + refcount_set_release(&hr_qp->refcount, 1); return 0; From 0c99acbc8b6c6dd526ae475a48ee1897b61072fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:48 -0300 Subject: [PATCH 24/24] RDMA/hns: Fix unlocked call to hns_roce_qp_remove() Sashiko points out that hns_roce_qp_remove() requires the caller to hold locks. The error flow in hns_roce_create_qp_common() doesn't hold those locks for the error unwind so it risks corrupting memory. Grab the same locks the other two callers use. Cc: stable@vger.kernel.org Fixes: e088a685eae9 ("RDMA/hns: Support rq record doorbell for the user space") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=9 Link: https://patch.msgid.link/r/15-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f94ba98871f0..bf04ee84a943 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1171,6 +1171,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp_resp resp = {}; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_ib_create_qp ucmd = {}; + unsigned long flags; int ret; mutex_init(&hr_qp->mutex); @@ -1257,7 +1258,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, return 0; err_flow_ctrl: + spin_lock_irqsave(&hr_dev->qp_list_lock, flags); + hns_roce_lock_cqs(init_attr->send_cq ? to_hr_cq(init_attr->send_cq) : NULL, + init_attr->recv_cq ? to_hr_cq(init_attr->recv_cq) : NULL); hns_roce_qp_remove(hr_dev, hr_qp); + hns_roce_unlock_cqs(init_attr->send_cq ? to_hr_cq(init_attr->send_cq) : NULL, + init_attr->recv_cq ? to_hr_cq(init_attr->recv_cq) : NULL); + spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags); err_store: free_qpc(hr_dev, hr_qp); err_qpc: