From 3a73746b267e5c6a87c9ad26f8c6a48e44da609c Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Tue, 20 Dec 2022 17:08:47 +0900 Subject: [PATCH 1/9] RDMA/rxe: Fix inaccurate constants in rxe_type_info ibv_query_device() has reported incorrect device attributes, which are actually not used by the device. Make the constants correspond with the attributes shown to users. Fixes: 3ccffe8abf2f ("RDMA/rxe: Move max_elem into rxe_type_info") Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by xarrays") Link: https://lore.kernel.org/r/20221220080848.253785-1-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index f50620f5a0a1..1151c0b5ccea 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -23,16 +23,16 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_UCONTEXT, + .max_elem = RXE_MAX_UCONTEXT, }, [RXE_TYPE_PD] = { .name = "pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_PD, + .max_elem = RXE_MAX_PD, }, [RXE_TYPE_AH] = { .name = "ah", @@ -40,7 +40,7 @@ static const struct rxe_type_info { .elem_offset = offsetof(struct rxe_ah, elem), .min_index = RXE_MIN_AH_INDEX, .max_index = RXE_MAX_AH_INDEX, - .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, + .max_elem = RXE_MAX_AH, }, [RXE_TYPE_SRQ] = { .name = "srq", @@ -49,7 +49,7 @@ static const struct rxe_type_info { .cleanup = rxe_srq_cleanup, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, - .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, + .max_elem = RXE_MAX_SRQ, }, [RXE_TYPE_QP] = { .name = "qp", @@ -58,7 +58,7 @@ static const struct rxe_type_info { .cleanup = rxe_qp_cleanup, .min_index = RXE_MIN_QP_INDEX, .max_index = RXE_MAX_QP_INDEX, - .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, + .max_elem = RXE_MAX_QP, }, [RXE_TYPE_CQ] = { .name = "cq", @@ -66,8 +66,8 @@ static const struct rxe_type_info { .elem_offset = offsetof(struct rxe_cq, elem), .cleanup = rxe_cq_cleanup, .min_index = 1, - .max_index = UINT_MAX, - .max_elem = UINT_MAX, + .max_index = RXE_MAX_CQ, + .max_elem = RXE_MAX_CQ, }, [RXE_TYPE_MR] = { .name = "mr", @@ -76,7 +76,7 @@ static const struct rxe_type_info { .cleanup = rxe_mr_cleanup, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, - .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, + .max_elem = RXE_MAX_MR, }, [RXE_TYPE_MW] = { .name = "mw", @@ -85,7 +85,7 @@ static const struct rxe_type_info { .cleanup = rxe_mw_cleanup, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, - .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, + .max_elem = RXE_MAX_MW, }, }; From 1aefe5c177c1922119afb4ee443ddd6ac3140b37 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Tue, 20 Dec 2022 17:08:48 +0900 Subject: [PATCH 2/9] RDMA/rxe: Prevent faulty rkey generation If you create MRs more than 0x10000 times after loading the module, responder starts to reply NAKs for RDMA/Atomic operations because of rkey violation detected in check_rkey(). The root cause is that rkeys are incremented each time a new MR is created and the value overflows into the range reserved for MWs. This commit also increases the value of RXE_MAX_MW that has been limited unlike other parameters. Fixes: 0994a1bcd5f7 ("RDMA/rxe: Bump up default maximum values used via uverbs") Link: https://lore.kernel.org/r/20221220080848.253785-2-matsuda-daisuke@fujitsu.com Signed-off-by: Daisuke Matsuda Tested-by: Li Zhijian Reviewed-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_param.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index a754fc902e3d..7b41d79e72b2 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -98,11 +98,11 @@ enum rxe_device_param { RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX, RXE_MIN_MR_INDEX = 0x00000001, - RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE, - RXE_MAX_MR = DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX, - RXE_MIN_MW_INDEX = 0x00010001, - RXE_MAX_MW_INDEX = 0x00020000, - RXE_MAX_MW = 0x00001000, + RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1, + RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX, + RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1, + RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE, + RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX, RXE_MAX_PKT_PER_ACK = 64, From 0afec5e9cea732cb47014655685a2a47fb180c31 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 9 Jan 2023 13:37:11 +0000 Subject: [PATCH 3/9] RDMA/core: Fix ib block iterator counter overflow When registering a new DMA MR after selecting the best aligned page size for it, we iterate over the given sglist to split each entry to smaller, aligned to the selected page size, DMA blocks. In given circumstances where the sg entry and page size fit certain sizes and the sg entry is not aligned to the selected page size, the total size of the aligned pages we need to cover the sg entry is >= 4GB. Under this circumstances, while iterating page aligned blocks, the counter responsible for counting how much we advanced from the start of the sg entry is overflowed because its type is u32 and we pass 4GB in size. This can lead to an infinite loop inside the iterator function because the overflow prevents the counter to be larger than the size of the sg entry. Fix the presented problem by changing the advancement condition to eliminate overflow. Backtrace: [ 192.374329] efa_reg_user_mr_dmabuf [ 192.376783] efa_register_mr [ 192.382579] pgsz_bitmap 0xfffff000 rounddown 0x80000000 [ 192.386423] pg_sz [0x80000000] umem_length[0xc0000000] [ 192.392657] start 0x0 length 0xc0000000 params.page_shift 31 params.page_num 3 [ 192.399559] hp_cnt[3], pages_in_hp[524288] [ 192.403690] umem->sgt_append.sgt.nents[1] [ 192.407905] number entries: [1], pg_bit: [31] [ 192.411397] biter->__sg_nents [1] biter->__sg [0000000008b0c5d8] [ 192.415601] biter->__sg_advance [665837568] sg_dma_len[3221225472] [ 192.419823] biter->__sg_nents [1] biter->__sg [0000000008b0c5d8] [ 192.423976] biter->__sg_advance [2813321216] sg_dma_len[3221225472] [ 192.428243] biter->__sg_nents [1] biter->__sg [0000000008b0c5d8] [ 192.432397] biter->__sg_advance [665837568] sg_dma_len[3221225472] Fixes: a808273a495c ("RDMA/verbs: Add a DMA iterator to return aligned contiguous memory blocks") Signed-off-by: Yonatan Nachum Link: https://lore.kernel.org/r/20230109133711.13678-1-ynachum@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 26b021f43ba4..11b1c1603aeb 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; + unsigned int sg_delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset; + sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; - if (biter->__sg_advance >= sg_dma_len(biter->__sg)) { + if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { + biter->__sg_advance += sg_delta; + } else { biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; From 0a0a6e80472c98947d73c3d13bcd7d101895f55d Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Jan 2023 12:31:11 -0500 Subject: [PATCH 4/9] IB/hfi1: Reject a zero-length user expected buffer A zero length user buffer makes no sense and the code does not handle it correctly. Instead, reject a zero length as invalid. Fixes: 97736f36dbeb ("IB/hfi1: Validate page aligned for a given virtual addres") Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/167328547120.1472310.6362802432127399257.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 186d30291260..3c609b11e71c 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -256,6 +256,8 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, if (!PAGE_ALIGNED(tinfo->vaddr)) return -EINVAL; + if (tinfo->length == 0) + return -EINVAL; tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); if (!tidbuf) From ecf91551cdd2925ed6d9a9d99074fa5f67b90596 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Jan 2023 12:31:16 -0500 Subject: [PATCH 5/9] IB/hfi1: Reserve user expected TIDs To avoid a race, reserve the number of user expected TIDs before setup. Fixes: 7e7a436ecb6e ("staging/hfi1: Add TID entry program function body") Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/167328547636.1472310.7419712824785353905.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 3c609b11e71c..d7487555d109 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -282,16 +282,13 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, /* Find sets of physically contiguous pages */ tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); - /* - * We don't need to access this under a lock since tid_used is per - * process and the same process cannot be in hfi1_user_exp_rcv_clear() - * and hfi1_user_exp_rcv_setup() at the same time. - */ + /* Reserve the number of expected tids to be used. */ spin_lock(&fd->tid_lock); if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) pageset_count = fd->tid_limit - fd->tid_used; else pageset_count = tidbuf->n_psets; + fd->tid_used += pageset_count; spin_unlock(&fd->tid_lock); if (!pageset_count) @@ -400,10 +397,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); + /* adjust reserved tid_used to actual count */ + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count - tididx; + spin_unlock(&fd->tid_lock); if (tididx) { - spin_lock(&fd->tid_lock); - fd->tid_used += tididx; - spin_unlock(&fd->tid_lock); tinfo->tidcnt = tididx; tinfo->length = mapped_pages * PAGE_SIZE; From e0c4a422f5246abefbf7c178ef99a1f2dc3c5f62 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Jan 2023 12:31:21 -0500 Subject: [PATCH 6/9] IB/hfi1: Fix expected receive setup error exit issues Fix three error exit issues in expected receive setup. Re-arrange error exits to increase readability. Issues and fixes: 1. Possible missed page unpin if tidlist copyout fails and not all pinned pages where made part of a TID. Fix: Unpin the unused pages. 2. Return success with unset return values tidcnt and length when no pages were pinned. Fix: Return -ENOSPC if no pages were pinned. 3. Return success with unset return values tidcnt and length when no rcvarray entries available. Fix: Return -ENOSPC if no rcvarray entries are available. Fixes: 7e7a436ecb6e ("staging/hfi1: Add TID entry program function body") Fixes: 97736f36dbeb ("IB/hfi1: Validate page aligned for a given virtual addres") Fixes: f404ca4c7ea8 ("IB/hfi1: Refactor hfi_user_exp_rcv_setup() IOCTL") Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/167328548150.1472310.1492305874804187634.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 87 ++++++++++++++--------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index d7487555d109..88df8ca4bb57 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -268,15 +268,14 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), GFP_KERNEL); if (!tidbuf->psets) { - kfree(tidbuf); - return -ENOMEM; + ret = -ENOMEM; + goto fail_release_mem; } pinned = pin_rcv_pages(fd, tidbuf); if (pinned <= 0) { - kfree(tidbuf->psets); - kfree(tidbuf); - return pinned; + ret = (pinned < 0) ? pinned : -ENOSPC; + goto fail_unpin; } /* Find sets of physically contiguous pages */ @@ -291,14 +290,16 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, fd->tid_used += pageset_count; spin_unlock(&fd->tid_lock); - if (!pageset_count) - goto bail; + if (!pageset_count) { + ret = -ENOSPC; + goto fail_unreserve; + } ngroups = pageset_count / dd->rcv_entries.group_size; tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); if (!tidlist) { ret = -ENOMEM; - goto nomem; + goto fail_unreserve; } tididx = 0; @@ -394,44 +395,60 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, } unlock: mutex_unlock(&uctxt->exp_mutex); -nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); + + /* fail if nothing was programmed, set error if none provided */ + if (tididx == 0) { + if (ret >= 0) + ret = -ENOSPC; + goto fail_unreserve; + } + /* adjust reserved tid_used to actual count */ spin_lock(&fd->tid_lock); fd->tid_used -= pageset_count - tididx; spin_unlock(&fd->tid_lock); - if (tididx) { - tinfo->tidcnt = tididx; - tinfo->length = mapped_pages * PAGE_SIZE; - if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), - tidlist, sizeof(tidlist[0]) * tididx)) { - /* - * On failure to copy to the user level, we need to undo - * everything done so far so we don't leak resources. - */ - tinfo->tidlist = (unsigned long)&tidlist; - hfi1_user_exp_rcv_clear(fd, tinfo); - tinfo->tidlist = 0; - ret = -EFAULT; - goto bail; - } + /* unpin all pages not covered by a TID */ + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, + false); + + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), + tidlist, sizeof(tidlist[0]) * tididx)) { + ret = -EFAULT; + goto fail_unprogram; } - /* - * If not everything was mapped (due to insufficient RcvArray entries, - * for example), unpin all unmapped pages so we can pin them nex time. - */ - if (mapped_pages != pinned) - unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, - (pinned - mapped_pages), false); -bail: - kfree(tidbuf->psets); - kfree(tidlist); kfree(tidbuf->pages); + kfree(tidbuf->psets); kfree(tidbuf); - return ret > 0 ? 0 : ret; + kfree(tidlist); + return 0; + +fail_unprogram: + /* unprogram, unmap, and unpin all allocated TIDs */ + tinfo->tidlist = (unsigned long)tidlist; + hfi1_user_exp_rcv_clear(fd, tinfo); + tinfo->tidlist = 0; + pinned = 0; /* nothing left to unpin */ + pageset_count = 0; /* nothing left reserved */ +fail_unreserve: + spin_lock(&fd->tid_lock); + fd->tid_used -= pageset_count; + spin_unlock(&fd->tid_lock); +fail_unpin: + if (pinned > 0) + unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); +fail_release_mem: + kfree(tidbuf->pages); + kfree(tidbuf->psets); + kfree(tidbuf); + kfree(tidlist); + return ret; } int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, From 1c7edde1b5720ddb0aff5ca8c7f605a0f92526eb Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Jan 2023 12:31:26 -0500 Subject: [PATCH 7/9] IB/hfi1: Immediately remove invalid memory from hardware When a user expected receive page is unmapped, it should be immediately removed from hardware rather than depend on a reaction from user space. Fixes: 2677a7680e77 ("IB/hfi1: Fix memory leak during unexpected shutdown") Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/167328548663.1472310.7871808081861622659.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 43 +++++++++++++++-------- drivers/infiniband/hw/hfi1/user_exp_rcv.h | 1 + 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 88df8ca4bb57..f402af1e2903 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -28,8 +28,9 @@ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, unsigned int start, u16 count, u32 *tidlist, unsigned int *tididx, unsigned int *pmapped); -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp); +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo); +static void __clear_tid_node(struct hfi1_filedata *fd, + struct tid_rb_node *node); static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static const struct mmu_interval_notifier_ops tid_mn_ops = { @@ -469,7 +470,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, mutex_lock(&uctxt->exp_mutex); for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { - ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); + ret = unprogram_rcvarray(fd, tidinfo[tididx]); if (ret) { hfi1_cdbg(TID, "Failed to unprogram rcv array %d", ret); @@ -723,6 +724,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, } node->fdata = fd; + mutex_init(&node->invalidate_mutex); node->phys = page_to_phys(pages[0]); node->npages = npages; node->rcventry = rcventry; @@ -762,8 +764,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, return -EFAULT; } -static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, - struct tid_group **grp) +static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; @@ -786,9 +787,6 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, if (!node || node->rcventry != (uctxt->expected_base + rcventry)) return -EBADF; - if (grp) - *grp = node->grp; - if (fd->use_mn) mmu_interval_notifier_remove(&node->notifier); cacheless_tid_rb_remove(fd, node); @@ -796,23 +794,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, return 0; } -static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) { struct hfi1_ctxtdata *uctxt = fd->uctxt; struct hfi1_devdata *dd = uctxt->dd; + mutex_lock(&node->invalidate_mutex); + if (node->freed) + goto done; + node->freed = true; + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, node->npages, node->notifier.interval_tree.start, node->phys, node->dma_addr); - /* - * Make sure device has seen the write before we unpin the - * pages. - */ + /* Make sure device has seen the write before pages are unpinned */ hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); +done: + mutex_unlock(&node->invalidate_mutex); +} + +static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + __clear_tid_node(fd, node); node->grp->used--; node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); @@ -871,10 +880,16 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, if (node->freed) return true; + /* take action only if unmapping */ + if (range->event != MMU_NOTIFY_UNMAP) + return true; + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->notifier.interval_tree.start, node->rcventry, node->npages, node->dma_addr); - node->freed = true; + + /* clear the hardware rcvarray entry */ + __clear_tid_node(fdata, node); spin_lock(&fdata->invalid_lock); if (fdata->invalid_tid_idx < uctxt->expected_count) { diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index 8c53e416bf84..2ddb3dac7d91 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -27,6 +27,7 @@ struct tid_user_buf { struct tid_rb_node { struct mmu_interval_notifier notifier; struct hfi1_filedata *fdata; + struct mutex invalidate_mutex; /* covers hw removal */ unsigned long phys; struct tid_group *grp; u32 rcventry; From b3deec25847bda34e34d5d7be02f633caf000bd8 Mon Sep 17 00:00:00 2001 From: Dean Luick Date: Mon, 9 Jan 2023 12:31:31 -0500 Subject: [PATCH 8/9] IB/hfi1: Remove user expected buffer invalidate race During setup, there is a possible race between a page invalidate and hardware programming. Add a covering invalidate over the user target range during setup. If anything within that range is invalidated during setup, fail the setup. Once set up, each TID will have its own invalidate callback and invalidate. Fixes: 3889551db212 ("RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv") Signed-off-by: Dean Luick Signed-off-by: Dennis Dalessandro Link: https://lore.kernel.org/r/167328549178.1472310.9867497376936699488.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 58 +++++++++++++++++++++-- drivers/infiniband/hw/hfi1/user_exp_rcv.h | 2 + 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index f402af1e2903..b02f2f0809c8 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -23,6 +23,9 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, const struct mmu_notifier_range *range, unsigned long cur_seq); +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq); static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, struct tid_group *grp, unsigned int start, u16 count, @@ -36,6 +39,9 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); static const struct mmu_interval_notifier_ops tid_mn_ops = { .invalidate = tid_rb_invalidate, }; +static const struct mmu_interval_notifier_ops tid_cover_ops = { + .invalidate = tid_cover_invalidate, +}; /* * Initialize context and file private data needed for Expected @@ -254,6 +260,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, tididx = 0, mapped, mapped_pages = 0; u32 *tidlist = NULL; struct tid_user_buf *tidbuf; + unsigned long mmu_seq = 0; if (!PAGE_ALIGNED(tinfo->vaddr)) return -EINVAL; @@ -264,6 +271,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, if (!tidbuf) return -ENOMEM; + mutex_init(&tidbuf->cover_mutex); tidbuf->vaddr = tinfo->vaddr; tidbuf->length = tinfo->length; tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), @@ -273,6 +281,16 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, goto fail_release_mem; } + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( + &tidbuf->notifier, current->mm, + tidbuf->vaddr, tidbuf->npages * PAGE_SIZE, + &tid_cover_ops); + if (ret) + goto fail_release_mem; + mmu_seq = mmu_interval_read_begin(&tidbuf->notifier); + } + pinned = pin_rcv_pages(fd, tidbuf); if (pinned <= 0) { ret = (pinned < 0) ? pinned : -ENOSPC; @@ -415,6 +433,20 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, false); + if (fd->use_mn) { + /* check for an invalidate during setup */ + bool fail = false; + + mutex_lock(&tidbuf->cover_mutex); + fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq); + mutex_unlock(&tidbuf->cover_mutex); + + if (fail) { + ret = -EBUSY; + goto fail_unprogram; + } + } + tinfo->tidcnt = tididx; tinfo->length = mapped_pages * PAGE_SIZE; @@ -424,6 +456,8 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, goto fail_unprogram; } + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); kfree(tidbuf->pages); kfree(tidbuf->psets); kfree(tidbuf); @@ -442,6 +476,8 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, fd->tid_used -= pageset_count; spin_unlock(&fd->tid_lock); fail_unpin: + if (fd->use_mn) + mmu_interval_notifier_remove(&tidbuf->notifier); if (pinned > 0) unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); fail_release_mem: @@ -740,11 +776,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, &tid_mn_ops); if (ret) goto out_unmap; - /* - * FIXME: This is in the wrong order, the notifier should be - * established before the pages are pinned by pin_rcv_pages. - */ - mmu_interval_read_begin(&node->notifier); } fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node; @@ -919,6 +950,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, return true; } +static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct tid_user_buf *tidbuf = + container_of(mni, struct tid_user_buf, notifier); + + /* take action only if unmapping */ + if (range->event == MMU_NOTIFY_UNMAP) { + mutex_lock(&tidbuf->cover_mutex); + mmu_interval_set_seq(mni, cur_seq); + mutex_unlock(&tidbuf->cover_mutex); + } + + return true; +} + static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata, struct tid_rb_node *tnode) { diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index 2ddb3dac7d91..f8ee997d0050 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -16,6 +16,8 @@ struct tid_pageset { }; struct tid_user_buf { + struct mmu_interval_notifier notifier; + struct mutex cover_mutex; unsigned long vaddr; unsigned long length; unsigned int npages; From 0f097f08c9b3c1fdb6cc9f2dd423abc17d13f1a2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 11 Jan 2023 12:10:54 +0200 Subject: [PATCH 9/9] lib/scatterlist: Fix to calculate the last_pg properly The last_pg is wrong, it is actually the first page of the last scatterlist element. To get the last page of the last scatterlist element we have to add prv->length. So it is checking mergability against the wrong page, Further, a SG element is not guaranteed to end on a page boundary, so we have to check the sub page location also for merge eligibility. Fix the above by checking physical contiguity based on PFNs, compute the actual last page and then call pages_are_mergable(). Fixes: 1567b49d1a40 ("lib/scatterlist: add check when merging zone device pages") Link: https://lore.kernel.org/r/20230111101054.188136-1-yishaih@nvidia.com Reported-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jason Gunthorpe --- lib/scatterlist.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index f72aa50c6654..8d7519a8f308 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -470,22 +470,27 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, return -EOPNOTSUPP; if (sgt_append->prv) { + unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) + + sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE; + if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; - last_pg = sg_page(sgt_append->prv); - while (n_pages && pages_are_mergeable(pages[0], last_pg)) { - if (sgt_append->prv->length + PAGE_SIZE > max_segment) - break; - sgt_append->prv->length += PAGE_SIZE; - last_pg = pages[0]; - pages++; - n_pages--; + if (page_to_pfn(pages[0]) == next_pfn) { + last_pg = pfn_to_page(next_pfn - 1); + while (n_pages && pages_are_mergeable(pages[0], last_pg)) { + if (sgt_append->prv->length + PAGE_SIZE > max_segment) + break; + sgt_append->prv->length += PAGE_SIZE; + last_pg = pages[0]; + pages++; + n_pages--; + } + if (!n_pages) + goto out; } - if (!n_pages) - goto out; } /* compute number of contiguous chunks */