Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma fixes from Jason Gunthorpe:

 - Fix a mlx5 malfunction if the UMR QP gets an error

 - Return the correct port number to userspace for a mlx5 DCT

 - Don't cause a UMR QP error if DMABUF teardown races with invalidation

 - Fix a WARN splat when unregistering mlx5 device memory MR types

 - Use the correct alignment for the mana doorbell so that two processes
   do not share the same physical page on non-4k page systems

 - MAINTAINERS updates for MANA

 - Retry failed HNS FW commands because some can take a long time

 - Cast void * handle to the correct type in bnxt to fix corruption

 - Avoid a NULL pointer crash in bnxt_re

 - Fix skipped ib_unregister_device() for bnxt_re due to some earlier
   rework

 - Correctly detect if the bnxt supports extended statistics

 - Fix refcount leak in mlx5 odp introduced by a previous fix

 - Map the FW result for the port rate to the userspace values properly
   in mlx5, returning correct values for newer 800G ports

 - Don't wrongly destroy counter objects that were not automatically
   created during mlx5 bind qp

 - Set page size/shift members of kernel owned SRQs to fix a crash in
   nvme target

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  RDMA/bnxt_re: Fix the page details for the srq created by kernel consumers
  RDMA/mlx5: Fix bind QP error cleanup flow
  RDMA/mlx5: Fix AH static rate parsing
  RDMA/mlx5: Fix implicit ODP hang on parent deregistration
  RDMA/bnxt_re: Fix the statistics for Gen P7 VF
  RDMA/bnxt_re: Fix issue in the unload path
  RDMA/bnxt_re: Add sanity checks on rdev validity
  RDMA/bnxt_re: Fix an issue in bnxt_re_async_notifier
  RDMA/hns: Fix mbox timing out by adding retry mechanism
  MAINTAINERS: update maintainer for Microsoft MANA RDMA driver
  RDMA/mana_ib: Allocate PAGE aligned doorbell index
  RDMA/mlx5: Fix a WARN during dereg_mr for DM type
  RDMA/mlx5: Fix a race for DMABUF MR which can lead to CQE with error
  IB/mlx5: Set and get correct qp_num for a DCT QP
  RDMA/mlx5: Fix the recovery flow of the UMR QP
This commit is contained in:
Linus Torvalds
2025-02-25 13:43:03 -08:00
16 changed files with 161 additions and 68 deletions

View File

@@ -15680,7 +15680,7 @@ F: include/uapi/linux/cciss*.h
MICROSOFT MANA RDMA DRIVER
M: Long Li <longli@microsoft.com>
M: Ajay Sharma <sharmaajay@microsoft.com>
M: Konstantin Taranov <kotaranov@microsoft.com>
L: linux-rdma@vger.kernel.org
S: Supported
F: drivers/infiniband/hw/mana/

View File

@@ -187,7 +187,6 @@ struct bnxt_re_dev {
#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29
struct net_device *netdev;
struct auxiliary_device *adev;
struct notifier_block nb;
unsigned int version, major, minor;
struct bnxt_qplib_chip_ctx *chip_ctx;
struct bnxt_en_dev *en_dev;

View File

@@ -348,8 +348,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
goto done;
}
bnxt_re_copy_err_stats(rdev, stats, err_s);
if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags) &&
!rdev->is_virtfn) {
if (bnxt_ext_stats_supported(rdev->chip_ctx, rdev->dev_attr->dev_cap_flags,
rdev->is_virtfn)) {
rc = bnxt_re_get_ext_stat(rdev, stats);
if (rc) {
clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS,

View File

@@ -1870,6 +1870,8 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit;
srq->srq_limit = srq_init_attr->attr.srq_limit;
srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id;
srq->qplib_srq.sg_info.pgsize = PAGE_SIZE;
srq->qplib_srq.sg_info.pgshft = PAGE_SHIFT;
nq = &rdev->nqr->nq[0];
if (udata) {

View File

@@ -396,11 +396,16 @@ static void bnxt_re_dcb_wq_task(struct work_struct *work)
static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl)
{
struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle;
struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle);
struct bnxt_re_dcb_work *dcb_work;
struct bnxt_re_dev *rdev;
u32 data1, data2;
u16 event_id;
rdev = en_info->rdev;
if (!rdev)
return;
event_id = le16_to_cpu(cmpl->event_id);
data1 = le32_to_cpu(cmpl->event_data1);
data2 = le32_to_cpu(cmpl->event_data2);
@@ -433,6 +438,8 @@ static void bnxt_re_stop_irq(void *handle, bool reset)
int indx;
rdev = en_info->rdev;
if (!rdev)
return;
rcfw = &rdev->rcfw;
if (reset) {
@@ -461,6 +468,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
int indx, rc;
rdev = en_info->rdev;
if (!rdev)
return;
msix_ent = rdev->nqr->msix_entries;
rcfw = &rdev->rcfw;
if (!ent) {
@@ -1350,7 +1359,6 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev,
return NULL;
}
/* Default values */
rdev->nb.notifier_call = NULL;
rdev->netdev = en_dev->net;
rdev->en_dev = en_dev;
rdev->adev = adev;
@@ -2345,15 +2353,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type)
static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type,
struct auxiliary_device *aux_dev)
{
if (rdev->nb.notifier_call) {
unregister_netdevice_notifier(&rdev->nb);
rdev->nb.notifier_call = NULL;
} else {
/* If notifier is null, we should have already done a
* clean up before coming here.
*/
return;
}
bnxt_re_setup_cc(rdev, false);
ib_unregister_device(&rdev->ibdev);
bnxt_re_dev_uninit(rdev, op_type);
@@ -2433,6 +2432,7 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx",
__func__, en_dev->en_state);
bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev);
bnxt_re_update_en_info_rdev(NULL, en_info, adev);
mutex_unlock(&bnxt_re_mutex);
return 0;

View File

@@ -547,6 +547,14 @@ static inline bool _is_ext_stats_supported(u16 dev_cap_flags)
CREQ_QUERY_FUNC_RESP_SB_EXT_STATS;
}
/*
 * Tell whether extended statistics may be queried for this function.
 *
 * The ext-stats capability flag must be set in @flags. Beyond that, a
 * physical function always qualifies, while a virtual function qualifies
 * only when @ctx reports a Gen P7 chip (the Thor2 VF case).
 *
 * Returns 1 when supported, 0 otherwise.
 */
static inline int bnxt_ext_stats_supported(struct bnxt_qplib_chip_ctx *ctx,
					   u16 flags, bool virtfn)
{
	if (!_is_ext_stats_supported(flags))
		return 0;

	/* Physical functions need nothing beyond the capability flag. */
	if (!virtfn)
		return 1;

	/* Virtual functions additionally require a Gen P7 chip. */
	return bnxt_qplib_is_chip_gen_p7(ctx) ? 1 : 0;
}
static inline bool _is_hw_retx_supported(u16 dev_cap_flags)
{
return dev_cap_flags &

View File

@@ -1286,10 +1286,8 @@ static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout)
return tx_timeout;
}
static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode)
static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u32 tx_timeout)
{
struct hns_roce_v2_priv *priv = hr_dev->priv;
u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout);
u32 timeout = 0;
do {
@@ -1299,8 +1297,9 @@ static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode)
} while (++timeout < tx_timeout);
}
static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc, int num)
static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc,
int num, u32 tx_timeout)
{
struct hns_roce_v2_priv *priv = hr_dev->priv;
struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
@@ -1309,8 +1308,6 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
int ret;
int i;
spin_lock_bh(&csq->lock);
tail = csq->head;
for (i = 0; i < num; i++) {
@@ -1324,22 +1321,17 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]);
hns_roce_wait_csq_done(hr_dev, le16_to_cpu(desc->opcode));
hns_roce_wait_csq_done(hr_dev, tx_timeout);
if (hns_roce_cmq_csq_done(hr_dev)) {
ret = 0;
for (i = 0; i < num; i++) {
/* check the result of hardware write back */
desc[i] = csq->desc[tail++];
desc_ret = le16_to_cpu(csq->desc[tail++].retval);
if (tail == csq->desc_num)
tail = 0;
desc_ret = le16_to_cpu(desc[i].retval);
if (likely(desc_ret == CMD_EXEC_SUCCESS))
continue;
dev_err_ratelimited(hr_dev->dev,
"Cmdq IO error, opcode = 0x%x, return = 0x%x.\n",
desc->opcode, desc_ret);
ret = hns_roce_cmd_err_convert_errno(desc_ret);
}
} else {
@@ -1354,14 +1346,54 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
ret = -EAGAIN;
}
spin_unlock_bh(&csq->lock);
if (ret)
atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_ERR_CNT]);
return ret;
}
/*
 * Post @num command descriptors starting at @desc, then copy the ring's
 * descriptors back into @desc.
 *
 * The opcode is read from the first descriptor and selects the timeout via
 * hns_roce_cmdq_tx_timeout(). At most HNS_ROCE_OPC_POST_MB_TRY_CNT attempts
 * are made; a further attempt happens only for HNS_ROCE_OPC_POST_MB commands
 * whose attempt returned -ETIME. Any other outcome ends the loop.
 *
 * Returns the result of the last __hns_roce_cmq_send_one() call.
 */
static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc, int num)
{
struct hns_roce_v2_priv *priv = hr_dev->priv;
struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
u16 opcode = le16_to_cpu(desc->opcode);
u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout);
u8 try_cnt = HNS_ROCE_OPC_POST_MB_TRY_CNT;
u32 rsv_tail;
int ret;
int i;
while (try_cnt) {
try_cnt--;
spin_lock_bh(&csq->lock);
/* Remember the ring head before sending; the copy-back below starts here. */
rsv_tail = csq->head;
ret = __hns_roce_cmq_send_one(hr_dev, desc, num, tx_timeout);
/* Retry only a timed-out POST_MB, and only while attempts remain. */
if (opcode == HNS_ROCE_OPC_POST_MB && ret == -ETIME &&
try_cnt) {
/* Drop the lock before delaying so it is not held across the gap. */
spin_unlock_bh(&csq->lock);
mdelay(HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC);
continue;
}
/*
 * Copy the ring descriptors back to the caller, wrapping at desc_num.
 * This runs whether ret is zero or not.
 */
for (i = 0; i < num; i++) {
desc[i] = csq->desc[rsv_tail++];
if (rsv_tail == csq->desc_num)
rsv_tail = 0;
}
spin_unlock_bh(&csq->lock);
break;
}
/* Rate-limited report of the final errno (not the raw hardware retval). */
if (ret)
dev_err_ratelimited(hr_dev->dev,
"Cmdq IO error, opcode = 0x%x, return = %d.\n",
opcode, ret);
return ret;
}
static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc, int num)
{

View File

@@ -230,6 +230,8 @@ enum hns_roce_opcode_type {
};
#define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000
#define HNS_ROCE_OPC_POST_MB_TRY_CNT 8
#define HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC 5
struct hns_roce_cmdq_tx_timeout_map {
u16 opcode;
u32 tx_timeout;

View File

@@ -174,7 +174,7 @@ static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
req.num_resources = 1;
req.alignment = 1;
req.alignment = PAGE_SIZE / MANA_PAGE_SIZE;
/* Have GDMA start searching from 0 */
req.allocated_resources = 0;

View File

@@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
ah->av.tclass = grh->traffic_class;
}
ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4);
ah->av.stat_rate_sl =
(mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4);
if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
if (init_attr->xmit_slave)

View File

@@ -546,6 +546,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
struct ib_qp *qp)
{
struct mlx5_ib_dev *dev = to_mdev(qp->device);
bool new = false;
int err;
if (!counter->id) {
@@ -560,6 +561,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
return err;
counter->id =
MLX5_GET(alloc_q_counter_out, out, counter_set_id);
new = true;
}
err = mlx5_ib_qp_set_counter(qp, counter);
@@ -569,8 +571,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
return 0;
fail_set_counter:
mlx5_ib_counter_dealloc(counter);
counter->id = 0;
if (new) {
mlx5_ib_counter_dealloc(counter);
counter->id = 0;
}
return err;
}

View File

@@ -1550,7 +1550,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
if (!umem_dmabuf->sgt)
if (!umem_dmabuf->sgt || !mr)
return;
mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
@@ -1935,7 +1935,8 @@ mlx5_alloc_priv_descs(struct ib_device *device,
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
if (!mr->umem && !mr->data_direct && mr->descs) {
if (!mr->umem && !mr->data_direct &&
mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
struct ib_device *device = mr->ibmr.device;
int size = mr->max_descs * mr->desc_size;
struct mlx5_ib_dev *dev = to_mdev(device);
@@ -2022,11 +2023,16 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
bool is_odp = is_odp_mr(mr);
bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
!to_ib_umem_dmabuf(mr->umem)->pinned;
int ret = 0;
if (is_odp)
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
if (is_odp_dma_buf)
dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
ent = mr->mmkey.cache_ent;
/* upon storing to a clean temp entry - schedule its cleanup */
@@ -2054,6 +2060,12 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
}
if (is_odp_dma_buf) {
if (!ret)
to_ib_umem_dmabuf(mr->umem)->private = NULL;
dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
}
return ret;
}

View File

@@ -242,6 +242,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
mr) {
xa_unlock(&imr->implicit_children);
mlx5r_deref_odp_mkey(&imr->mmkey);
return;
}

View File

@@ -3447,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate)
return 0;
}
static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate)
{
u32 stat_rate_support;
if (rate == IB_RATE_PORT_CURRENT)
if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS)
return 0;
if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS)
@@ -3596,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
sizeof(grh->dgid.raw));
}
err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah));
err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah));
if (err < 0)
return err;
MLX5_SET(ads, path, stat_rate, err);
@@ -4579,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
MLX5_SET(dctc, dctc, counter_set_id, set_id);
qp->port = attr->port_num;
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
struct mlx5_ib_modify_qp_resp resp = {};
u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {};
@@ -5074,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp,
}
if (qp_attr_mask & IB_QP_PORT)
qp_attr->port_num = MLX5_GET(dctc, dctc, port);
qp_attr->port_num = mqp->port;
if (qp_attr_mask & IB_QP_MIN_RNR_TIMER)
qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak);
if (qp_attr_mask & IB_QP_AV) {

View File

@@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
int mlx5_ib_qp_event_init(void);
void mlx5_ib_qp_event_cleanup(void);
int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate);
#endif /* _MLX5_IB_QP_H */

View File

@@ -231,30 +231,6 @@ void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
ib_dealloc_pd(dev->umrc.pd);
}
/*
 * Reset the UMR QP and bring it back up through mlx5r_umr_qp_rst2rts().
 *
 * Sets umrc->state to MLX5_UMR_STATE_ACTIVE and returns 0 on success. On
 * any failure sets umrc->state to MLX5_UMR_STATE_ERR and returns the error.
 */
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
{
struct umr_common *umrc = &dev->umrc;
struct ib_qp_attr attr;
int err;
/* Only qp_state is filled in; the IB_QP_STATE mask limits what is applied. */
attr.qp_state = IB_QPS_RESET;
err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
if (err) {
mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
goto err;
}
err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
if (err)
goto err;
umrc->state = MLX5_UMR_STATE_ACTIVE;
return 0;
err:
umrc->state = MLX5_UMR_STATE_ERR;
return err;
}
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
struct mlx5r_umr_wqe *wqe, bool with_data)
@@ -302,6 +278,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
return err;
}
/*
 * Recover the UMR QP after a failed work request.
 *
 * Steps, in order:
 *  1. Under umrc->lock, force umrc->state to MLX5_UMR_STATE_RECOVER (warning
 *     if it was not already in that state).
 *  2. Re-post the given @wqe as a barrier and wait for @umr_context->done.
 *  3. Move the QP to RESET, then back up via mlx5r_umr_qp_rst2rts().
 *
 * Sets umrc->state to MLX5_UMR_STATE_ACTIVE and returns 0 on success. On
 * any failure sets umrc->state to MLX5_UMR_STATE_ERR and returns the error.
 *
 * NOTE(review): the final state writes happen without umrc->lock held in
 * this function; confirm this is safe against concurrent state readers.
 */
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
struct mlx5r_umr_context *umr_context,
struct mlx5r_umr_wqe *wqe, bool with_data)
{
struct umr_common *umrc = &dev->umrc;
struct ib_qp_attr attr;
int err;
mutex_lock(&umrc->lock);
/* Preventing any further WRs to be sent now */
if (umrc->state != MLX5_UMR_STATE_RECOVER) {
mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
umrc->state);
umrc->state = MLX5_UMR_STATE_RECOVER;
}
mutex_unlock(&umrc->lock);
/* Sending a final/barrier WR (the failed one) and wait for its completion.
 * This will ensure that all the previous WRs got a completion before
 * we set the QP state to RESET.
 */
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
with_data);
if (err) {
mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
goto err;
}
/* Since the QP is in an error state, it will only receive
 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
 * we don't care about its status.
 */
wait_for_completion(&umr_context->done);
/* Only qp_state is filled in; the IB_QP_STATE mask limits what is applied. */
attr.qp_state = IB_QPS_RESET;
err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
if (err) {
mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
goto err;
}
err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
if (err) {
mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
goto err;
}
umrc->state = MLX5_UMR_STATE_ACTIVE;
return 0;
err:
umrc->state = MLX5_UMR_STATE_ERR;
return err;
}
static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct mlx5_ib_umr_context *context =
@@ -366,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
mlx5_ib_warn(dev,
"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
umr_context.status, mkey);
mutex_lock(&umrc->lock);
err = mlx5r_umr_recover(dev);
mutex_unlock(&umrc->lock);
err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
if (err)
mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
err);