mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-02 09:21:19 -04:00
Merge branch 'mlx5-misc-fixes-2026-02-18'
Tariq Toukan says:

====================
mlx5 misc fixes 2026-02-18

This patchset provides misc bug fixes from the team to the mlx5 core and
Eth drivers.
====================

Link: https://patch.msgid.link/20260218072904.1764634-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -180,7 +180,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
|
||||
}
|
||||
|
||||
/* Use this function to get max num channels (rxqs/txqs) only to create netdev */
|
||||
static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
|
||||
static inline unsigned int
|
||||
mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
|
||||
{
|
||||
return is_kdump_kernel() ?
|
||||
MLX5E_MIN_NUM_CHANNELS :
|
||||
|
||||
@@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
|
||||
{
|
||||
struct mlx5e_ptpsq *ptpsq =
|
||||
container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
|
||||
struct mlx5e_txqsq *sq = &ptpsq->txqsq;
|
||||
|
||||
/* Recovering the PTP SQ means re-enabling NAPI, which requires the
|
||||
* netdev instance lock. However, SQ closing has to wait for this work
|
||||
* task to finish while also holding the same lock. So either get the
|
||||
* lock or find that the SQ is no longer enabled and thus this work is
|
||||
* not relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(sq->netdev)) {
|
||||
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
|
||||
return;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
|
||||
netdev_unlock(sq->netdev);
|
||||
}
|
||||
|
||||
static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// Copyright (c) 2019 Mellanox Technologies.
|
||||
|
||||
#include <net/netdev_lock.h>
|
||||
|
||||
#include "health.h"
|
||||
#include "params.h"
|
||||
#include "txrx.h"
|
||||
@@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
|
||||
rq = ctx;
|
||||
priv = rq->priv;
|
||||
|
||||
/* Acquire netdev instance lock to synchronize with channel close and
|
||||
* reopen flows. Either successfully obtain the lock, or detect that
|
||||
* channels are closing for another reason, making this work no longer
|
||||
* necessary.
|
||||
*/
|
||||
while (!netdev_trylock(rq->netdev)) {
|
||||
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
|
||||
return 0;
|
||||
msleep(20);
|
||||
}
|
||||
mutex_lock(&priv->state_lock);
|
||||
|
||||
eq = rq->cq.mcq.eq;
|
||||
@@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
|
||||
clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);
|
||||
|
||||
mutex_unlock(&priv->state_lock);
|
||||
netdev_unlock(rq->netdev);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright (c) 2019 Mellanox Technologies. */
|
||||
|
||||
#include <net/netdev_lock.h>
|
||||
|
||||
#include "health.h"
|
||||
#include "en/ptp.h"
|
||||
#include "en/devlink.h"
|
||||
@@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
|
||||
if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
|
||||
return 0;
|
||||
|
||||
/* Recovering queues means re-enabling NAPI, which requires the netdev
|
||||
* instance lock. However, SQ closing flows have to wait for work tasks
|
||||
* to finish while also holding the netdev instance lock. So either get
|
||||
* the lock or find that the SQ is no longer enabled and thus this work
|
||||
* is not relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(dev)) {
|
||||
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
|
||||
return 0;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
|
||||
if (err) {
|
||||
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
|
||||
@@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
|
||||
else
|
||||
mlx5e_trigger_napi_sched(sq->cq.napi);
|
||||
|
||||
netdev_unlock(dev);
|
||||
return 0;
|
||||
out:
|
||||
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
|
||||
netdev_unlock(dev);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -137,10 +153,24 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
|
||||
sq = to_ctx->sq;
|
||||
eq = sq->cq.mcq.eq;
|
||||
priv = sq->priv;
|
||||
|
||||
/* Recovering the TX queues implies re-enabling NAPI, which requires
|
||||
* the netdev instance lock.
|
||||
* However, channel closing flows have to wait for this work to finish
|
||||
* while holding the same lock. So either get the lock or find that
|
||||
* channels are being closed for other reason and this work is not
|
||||
* relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(sq->netdev)) {
|
||||
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
|
||||
return 0;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
|
||||
if (!err) {
|
||||
to_ctx->status = 0; /* this sq recovered */
|
||||
return err;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&priv->state_lock);
|
||||
@@ -148,7 +178,7 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
|
||||
mutex_unlock(&priv->state_lock);
|
||||
if (!err) {
|
||||
to_ctx->status = 1; /* all channels recovered */
|
||||
return err;
|
||||
goto out;
|
||||
}
|
||||
|
||||
to_ctx->status = err;
|
||||
@@ -156,7 +186,8 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
|
||||
netdev_err(priv->netdev,
|
||||
"mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
|
||||
err);
|
||||
|
||||
out:
|
||||
netdev_unlock(sq->netdev);
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
|
||||
return 0;
|
||||
|
||||
priv = ptpsq->txqsq.priv;
|
||||
netdev = priv->netdev;
|
||||
|
||||
/* Recovering the PTP SQ means re-enabling NAPI, which requires the
|
||||
* netdev instance lock. However, SQ closing has to wait for this work
|
||||
* task to finish while also holding the same lock. So either get the
|
||||
* lock or find that the SQ is no longer enabled and thus this work is
|
||||
* not relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(netdev)) {
|
||||
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
|
||||
return 0;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
mutex_lock(&priv->state_lock);
|
||||
chs = &priv->channels;
|
||||
netdev = priv->netdev;
|
||||
|
||||
carrier_ok = netif_carrier_ok(netdev);
|
||||
netif_carrier_off(netdev);
|
||||
@@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
|
||||
netif_carrier_on(netdev);
|
||||
|
||||
mutex_unlock(&priv->state_lock);
|
||||
netdev_unlock(netdev);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
|
||||
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
|
||||
#include <linux/iopoll.h>
|
||||
#include <linux/math64.h>
|
||||
#include "lib/aso.h"
|
||||
#include "en/tc/post_act.h"
|
||||
@@ -115,7 +116,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
|
||||
struct mlx5e_flow_meters *flow_meters;
|
||||
u8 cir_man, cir_exp, cbs_man, cbs_exp;
|
||||
struct mlx5_aso_wqe *aso_wqe;
|
||||
unsigned long expires;
|
||||
struct mlx5_aso *aso;
|
||||
u64 rate, burst;
|
||||
u8 ds_cnt;
|
||||
@@ -187,12 +187,8 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
|
||||
mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl);
|
||||
|
||||
/* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. */
|
||||
expires = jiffies + msecs_to_jiffies(10);
|
||||
do {
|
||||
err = mlx5_aso_poll_cq(aso, true);
|
||||
if (err)
|
||||
usleep_range(2, 10);
|
||||
} while (err && time_is_after_jiffies(expires));
|
||||
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
|
||||
false, aso, true);
|
||||
mutex_unlock(&flow_meters->aso_lock);
|
||||
|
||||
return err;
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <linux/mlx5/mlx5_ifc.h>
|
||||
#include <linux/xarray.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/iopoll.h>
|
||||
|
||||
#include "en.h"
|
||||
#include "lib/aso.h"
|
||||
@@ -1385,7 +1386,8 @@ static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_mac
|
||||
MLX5_ACCESS_ASO_OPC_MOD_MACSEC);
|
||||
macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in);
|
||||
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
|
||||
err = mlx5_aso_poll_cq(maso, false);
|
||||
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
|
||||
false, maso, false);
|
||||
mutex_unlock(&aso->aso_lock);
|
||||
|
||||
return err;
|
||||
@@ -1397,7 +1399,6 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
|
||||
struct mlx5e_macsec_aso *aso;
|
||||
struct mlx5_aso_wqe *aso_wqe;
|
||||
struct mlx5_aso *maso;
|
||||
unsigned long expires;
|
||||
int err;
|
||||
|
||||
aso = &macsec->aso;
|
||||
@@ -1411,12 +1412,8 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
|
||||
macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL);
|
||||
|
||||
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
|
||||
expires = jiffies + msecs_to_jiffies(10);
|
||||
do {
|
||||
err = mlx5_aso_poll_cq(maso, false);
|
||||
if (err)
|
||||
usleep_range(2, 10);
|
||||
} while (err && time_is_after_jiffies(expires));
|
||||
read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
|
||||
false, maso, false);
|
||||
|
||||
if (err)
|
||||
goto err_out;
|
||||
|
||||
@@ -631,19 +631,7 @@ static void mlx5e_rq_timeout_work(struct work_struct *timeout_work)
|
||||
struct mlx5e_rq,
|
||||
rx_timeout_work);
|
||||
|
||||
/* Acquire netdev instance lock to synchronize with channel close and
|
||||
* reopen flows. Either successfully obtain the lock, or detect that
|
||||
* channels are closing for another reason, making this work no longer
|
||||
* necessary.
|
||||
*/
|
||||
while (!netdev_trylock(rq->netdev)) {
|
||||
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
|
||||
return;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
mlx5e_reporter_rx_timeout(rq);
|
||||
netdev_unlock(rq->netdev);
|
||||
}
|
||||
|
||||
static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq)
|
||||
@@ -1952,20 +1940,7 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
|
||||
struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
|
||||
recover_work);
|
||||
|
||||
/* Recovering queues means re-enabling NAPI, which requires the netdev
|
||||
* instance lock. However, SQ closing flows have to wait for work tasks
|
||||
* to finish while also holding the netdev instance lock. So either get
|
||||
* the lock or find that the SQ is no longer enabled and thus this work
|
||||
* is not relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(sq->netdev)) {
|
||||
if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
|
||||
return;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
mlx5e_reporter_tx_err_cqe(sq);
|
||||
netdev_unlock(sq->netdev);
|
||||
}
|
||||
|
||||
static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
|
||||
@@ -5115,19 +5090,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
|
||||
struct net_device *netdev = priv->netdev;
|
||||
int i;
|
||||
|
||||
/* Recovering the TX queues implies re-enabling NAPI, which requires
|
||||
* the netdev instance lock.
|
||||
* However, channel closing flows have to wait for this work to finish
|
||||
* while holding the same lock. So either get the lock or find that
|
||||
* channels are being closed for other reason and this work is not
|
||||
* relevant anymore.
|
||||
*/
|
||||
while (!netdev_trylock(netdev)) {
|
||||
if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
|
||||
return;
|
||||
msleep(20);
|
||||
}
|
||||
|
||||
for (i = 0; i < netdev->real_num_tx_queues; i++) {
|
||||
struct netdev_queue *dev_queue =
|
||||
netdev_get_tx_queue(netdev, i);
|
||||
@@ -5140,8 +5102,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
|
||||
/* break if tried to reopened channels */
|
||||
break;
|
||||
}
|
||||
|
||||
netdev_unlock(netdev);
|
||||
}
|
||||
|
||||
static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
|
||||
#include <linux/io.h>
|
||||
#include <linux/iopoll.h>
|
||||
#include <linux/mlx5/transobj.h>
|
||||
#include "lib/clock.h"
|
||||
#include "mlx5_core.h"
|
||||
@@ -15,7 +16,7 @@
|
||||
#define TEST_WC_NUM_WQES 255
|
||||
#define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES))
|
||||
#define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ
|
||||
#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
|
||||
#define TEST_WC_POLLING_MAX_TIME_USEC (100 * USEC_PER_MSEC)
|
||||
|
||||
struct mlx5_wc_cq {
|
||||
/* data path - accessed per cqe */
|
||||
@@ -359,7 +360,6 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq)
|
||||
static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
|
||||
{
|
||||
unsigned int offset = 0;
|
||||
unsigned long expires;
|
||||
struct mlx5_wc_sq *sq;
|
||||
int i, err;
|
||||
|
||||
@@ -389,13 +389,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
|
||||
|
||||
mlx5_wc_post_nop(sq, &offset, true);
|
||||
|
||||
expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
|
||||
do {
|
||||
err = mlx5_wc_poll_cq(sq);
|
||||
if (err)
|
||||
usleep_range(2, 10);
|
||||
} while (mdev->wc_state == MLX5_WC_STATE_UNINITIALIZED &&
|
||||
time_is_after_jiffies(expires));
|
||||
poll_timeout_us(mlx5_wc_poll_cq(sq),
|
||||
mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED, 10,
|
||||
TEST_WC_POLLING_MAX_TIME_USEC, false);
|
||||
|
||||
mlx5_wc_destroy_sq(sq);
|
||||
|
||||
|
||||
@@ -1282,12 +1282,12 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
|
||||
static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev)
|
||||
{
|
||||
return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) &&
|
||||
MLX5_CAP_GEN(dev, num_vhca_ports) <= 1;
|
||||
MLX5_CAP_GEN_MAX(dev, num_vhca_ports) <= 1;
|
||||
}
|
||||
|
||||
static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev)
|
||||
{
|
||||
return MLX5_CAP_GEN(dev, num_vhca_ports) > 1;
|
||||
return MLX5_CAP_GEN_MAX(dev, num_vhca_ports) > 1;
|
||||
}
|
||||
|
||||
static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev)
|
||||
|
||||
Reference in New Issue
Block a user