From 0e22bfb7c046e7c8ae339f396e78a0976633698c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 12 Jan 2021 16:13:22 +0200 Subject: [PATCH 01/15] net/mlx5e: E-switch, Fix rate calculation for overflow rate_bytes_ps is a 64-bit field. It is passed as a 32-bit field to apply_police_params(). Due to this, when the police rate is higher than 4Gbps, the 32-bit calculation ignores the carry. This results in an incorrect rate configuration on the device. Fix it by performing a 64-bit calculation. Fixes: fcb64c0f5640 ("net/mlx5: E-Switch, add ingress rate support") Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index dd0bfbacad47..717fbaa6ce73 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5040,7 +5040,7 @@ static int apply_police_params(struct mlx5e_priv *priv, u64 rate, */ if (rate) { rate = (rate * BITS_PER_BYTE) + 500000; - rate_mbps = max_t(u32, do_div(rate, 1000000), 1); + rate_mbps = max_t(u64, do_div(rate, 1000000), 1); } err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps); From e4484d9df5000a18916e0bbcee50828eac8e293e Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Sun, 24 Jan 2021 22:40:23 +0200 Subject: [PATCH 02/15] net/mlx5e: Enable striding RQ for Connect-X IPsec capable devices This limitation was inherited from the previous Innova (FPGA) IPsec implementation, which uses its own private set of RQ handlers that do not support striding RQ; for Connect-X this is no longer true. Fix by keeping this limitation only for Innova IPsec supporting devices, as otherwise this limitation effectively wrongly blocks striding RQs for all future Connect-X devices for all flows even if IPsec offload is not used. 
Fixes: 2d64663cd559 ("net/mlx5: IPsec: Add HW crypto offload support") Signed-off-by: Raed Salem Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h | 2 ++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3fc7d18ac868..0ae22a018dc2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -65,6 +65,7 @@ #include "en/devlink.h" #include "lib/mlx5.h" #include "en/ptp.h" +#include "fpga/ipsec.h" bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { @@ -106,7 +107,7 @@ bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev, if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) return false; - if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) return false; if (params->xdp_prog) { @@ -2069,7 +2070,7 @@ static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, int i; #ifdef CONFIG_MLX5_EN_IPSEC - if (MLX5_IPSEC_DEV(mdev)) + if (mlx5_fpga_is_ipsec_device(mdev)) byte_count += MLX5E_METADATA_ETHER_LEN; #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index ca4b55839a8a..4864deed9dc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1795,8 +1795,8 @@ int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe; #ifdef CONFIG_MLX5_EN_IPSEC - if (MLX5_IPSEC_DEV(mdev)) { - netdev_err(netdev, "MPWQE RQ with IPSec offload not supported\n"); + if (mlx5_fpga_is_ipsec_device(mdev)) { + netdev_err(netdev, 
"MPWQE RQ with Innova IPSec offload not supported\n"); return -EINVAL; } #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index cc67366495b0..22bee4990232 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -124,7 +124,7 @@ struct mlx5_fpga_ipsec { struct ida halloc; }; -static bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) { if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga)) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h index db88eb4c49e3..8931b5584477 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -43,6 +43,7 @@ u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev); const struct mlx5_flow_cmds * mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type); void mlx5_fpga_ipsec_build_fs_cmds(void); +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev); #else static inline const struct mlx5_accel_ipsec_ops *mlx5_fpga_ipsec_ops(struct mlx5_core_dev *mdev) @@ -55,6 +56,7 @@ mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) } static inline void mlx5_fpga_ipsec_build_fs_cmds(void) {}; +static inline bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) { return false; } #endif /* CONFIG_MLX5_FPGA_IPSEC */ #endif /* __MLX5_FPGA_IPSEC_H__ */ From e33f9f5f2d3a5fa97728a43708f41da2d4faae65 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Mon, 25 Jan 2021 09:58:22 +0200 Subject: [PATCH 03/15] net/mlx5e: Enable XDP for Connect-X IPsec capable devices This limitation was inherited by previous Innova (FPGA) IPsec implementation, it uses its private set of RQ handlers which does not support XDP, for Connect-X this is no longer true. 
Fix by keeping this limitation only for Innova IPsec supporting devices, as otherwise this limitation effectively wrongly blocks XDP for all future Connect-X devices for all flows even if IPsec offload is not used. Fixes: 2d64663cd559 ("net/mlx5: IPsec: Add HW crypto offload support") Signed-off-by: Raed Salem Reviewed-by: Alaa Hleihel Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0ae22a018dc2..5052820f7a51 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4456,8 +4456,9 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) return -EINVAL; } - if (MLX5_IPSEC_DEV(priv->mdev)) { - netdev_warn(netdev, "can't set XDP with IPSec offload\n"); + if (mlx5_fpga_is_ipsec_device(priv->mdev)) { + netdev_warn(netdev, + "XDP is not available on Innova cards with IPsec support\n"); return -EINVAL; } From 019f93bc4ba3a0dcb77f448ee77fc4c9c1b89565 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 3 Feb 2021 17:55:58 +0200 Subject: [PATCH 04/15] net/mlx5e: Don't change interrupt moderation params when DIM is enabled When mlx5e_ethtool_set_coalesce doesn't change DIM state (enabled/disabled), it calls mlx5e_set_priv_channels_coalesce unconditionally, which in turn invokes a firmware command to set interrupt moderation parameters. It shouldn't happen while DIM manages those parameters dynamically (it might even be happening at the same time). This patch fixes it by splitting mlx5e_set_priv_channels_coalesce into two functions (for RX and TX) and calling them only when DIM is disabled (for RX and TX respectively). 
Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 302001d6661e..d7ff5fa45cb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -525,7 +525,7 @@ static int mlx5e_get_coalesce(struct net_device *netdev, #define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT static void -mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) { struct mlx5_core_dev *mdev = priv->mdev; int tc; @@ -540,6 +540,17 @@ mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesc coal->tx_coalesce_usecs, coal->tx_max_coalesced_frames); } + } +} + +static void +mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i]; mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, coal->rx_coalesce_usecs, @@ -596,7 +607,10 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; if (!reset_rx && !reset_tx) { - mlx5e_set_priv_channels_coalesce(priv, coal); + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); priv->channels.params = new_channels.params; goto out; } From 65ba8594a238c20e458b3d2d39d91067cbffd0b1 Mon Sep 17 00:00:00 2001 From: Maxim 
Mikityanskiy Date: Wed, 3 Feb 2021 18:01:03 +0200 Subject: [PATCH 05/15] net/mlx5e: Change interrupt moderation channel params also when channels are closed struct mlx5e_params contains fields ({rx,tx}_cq_moderation) that depend on two things: whether DIM is enabled and the state of a private flag (MLX5E_PFLAG_{RX,TX}_CQE_BASED_MODER). Whenever the DIM state changes, mlx5e_reset_{rx,tx}_moderation is called to update the fields, however, only if the channels are open. The flow where the channels are closed misses the required update of the fields. This commit moves the calls of mlx5e_reset_{rx,tx}_moderation, so that they run in both flows. Fixes: ebeaf084ad5c ("net/mlx5e: Properly set default values when disabling adaptive moderation") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d7ff5fa45cb7..8612c388db7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -597,24 +597,9 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, tx_moder->pkts = coal->tx_max_coalesced_frames; new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce; - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - priv->channels.params = new_channels.params; - goto out; - } - /* we are opened */ - reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; - if (!reset_rx && !reset_tx) { - if (!coal->use_adaptive_rx_coalesce) - mlx5e_set_priv_channels_rx_coalesce(priv, coal); - if (!coal->use_adaptive_tx_coalesce) - mlx5e_set_priv_channels_tx_coalesce(priv, coal); - priv->channels.params = 
new_channels.params; - goto out; - } - if (reset_rx) { u8 mode = MLX5E_GET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); @@ -628,6 +613,20 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, mlx5e_reset_tx_moderation(&new_channels.params, mode); } + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_channels.params; + goto out; + } + + if (!reset_rx && !reset_tx) { + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); + priv->channels.params = new_channels.params; + goto out; + } + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL, NULL); out: From 51d138c2610a236c1ed0059d034ee4c74f452b86 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 23 Nov 2020 08:39:10 +0200 Subject: [PATCH 06/15] net/mlx5: Fix health error state handling Currently, when we discover a fatal error, we queue a work item that waits for a lock in order to move the device into the error state. Meanwhile, FW commands are still being processed and time out. This can block the driver for a few minutes before the work manages to get the lock and enter the error state. Set the device to the error state before queueing the health work, in order to avoid FW commands being processed while the work is waiting for the lock. 
Fixes: c1d4d2e92ad6 ("net/mlx5: Avoid calling sleeping function by the health poll thread") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/health.c | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 54523bed16cd..0c32c485eb58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -190,6 +190,16 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev) return true; } +static void enter_error_state(struct mlx5_core_dev *dev, bool force) +{ + if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mlx5_cmd_flush(dev); + } + + mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); +} + void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) { bool err_detected = false; @@ -208,12 +218,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) goto unlock; } - if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */ - dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; - mlx5_cmd_flush(dev); - } - - mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); + enter_error_state(dev, force); unlock: mutex_unlock(&dev->intf_state_mutex); } @@ -613,7 +618,7 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) priv = container_of(health, struct mlx5_priv, health); dev = container_of(priv, struct mlx5_core_dev, priv); - mlx5_enter_error_state(dev, false); + enter_error_state(dev, false); if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { if (mlx5_health_try_recover(dev)) mlx5_core_err(dev, "health recovery failed\n"); @@ -707,8 +712,9 @@ static void poll_health(struct timer_list *t) mlx5_core_err(dev, "Fatal 
error %u detected\n", fatal_error); dev->priv.health.fatal_error = fatal_error; print_health_info(dev); + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_trigger_health_work(dev); - goto out; + return; } count = ioread32be(health->health_counter); From 4d6e6b0c6d4bed8a7128500701354e2dc6098fa3 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 1 Feb 2021 14:01:27 +0200 Subject: [PATCH 07/15] net/mlx5e: Replace synchronize_rcu with synchronize_net The commit cited below switched from using napi_synchronize to synchronize_rcu to have a guarantee that it will finish in finite time. However, on average, synchronize_rcu takes more time than napi_synchronize. Given that it's called multiple times per channel on deactivation, it accumulates to a significant amount, which causes timeouts in some applications (for example, when using bonding with NetworkManager). This commit replaces synchronize_rcu with synchronize_net, which is faster when called under rtnl_lock, allowing to speed up the described flow. Fixes: 9c25a22dfb00 ("net/mlx5e: Use synchronize_rcu to sync with NAPI") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index d487e5e37162..8d991c3b7a50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -83,7 +83,7 @@ static inline void mlx5e_xdp_tx_disable(struct mlx5e_priv *priv) clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); /* Let other device's napi(s) and XSK wakeups see our new state. 
*/ - synchronize_rcu(); + synchronize_net(); } static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index d87c345878d3..f4bce1365639 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -111,7 +111,7 @@ int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params, void mlx5e_close_xsk(struct mlx5e_channel *c) { clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state); - synchronize_rcu(); /* Sync with the XSK wakeup and with NAPI. */ + synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */ mlx5e_close_rq(&c->xskrq); mlx5e_close_cq(&c->xskrq.cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 6a1d82503ef8..0f13b661f7f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -663,7 +663,7 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) priv_rx = mlx5e_get_ktls_rx_priv_ctx(tls_ctx); set_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags); mlx5e_set_ktls_rx_priv_ctx(tls_ctx, NULL); - synchronize_rcu(); /* Sync with NAPI */ + synchronize_net(); /* Sync with NAPI */ if (!cancel_work_sync(&priv_rx->rule.work)) /* completion is needed, as the priv_rx in the add flow * is maintained on the wqe info (wi), not on the socket. 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5052820f7a51..3edc826cc6bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -915,7 +915,7 @@ void mlx5e_activate_rq(struct mlx5e_rq *rq) void mlx5e_deactivate_rq(struct mlx5e_rq *rq) { clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); - synchronize_rcu(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ } void mlx5e_close_rq(struct mlx5e_rq *rq) @@ -1349,7 +1349,7 @@ void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) struct mlx5_wq_cyc *wq = &sq->wq; clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ + synchronize_net(); /* Sync with NAPI to prevent netif_tx_wake_queue. */ mlx5e_tx_disable_queue(sq->txq); @@ -1424,7 +1424,7 @@ void mlx5e_activate_icosq(struct mlx5e_icosq *icosq) void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) { clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */ } void mlx5e_close_icosq(struct mlx5e_icosq *sq) @@ -1503,7 +1503,7 @@ void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq) struct mlx5e_channel *c = sq->channel; clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - synchronize_rcu(); /* Sync with NAPI. */ + synchronize_net(); /* Sync with NAPI. */ mlx5e_destroy_sq(c->mdev, sq->sqn); mlx5e_free_xdpsq_descs(sq); From ebf79b6be67c0a77a9ab7cdf74c43fd7d9619f0c Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 10 Feb 2021 09:25:28 +0200 Subject: [PATCH 08/15] net/mlx5e: Fix CQ params of ICOSQ and async ICOSQ The commit mentioned below has split the parameters of ICOSQ and async ICOSQ, but it contained a typo: the CQ parameters were swapped for ICOSQ and async ICOSQ. 
Async ICOSQ is longer than the normal ICOSQ, and the CQ size must be the same as the size of the corresponding SQ, but due to this bug, the CQ of async ICOSQ was much shorter than async ICOSQ itself. It led to overflows of the CQ with such messages in dmesg, in particular, when running multiple kTLS-offloaded streams: mlx5_core 0000:08:00.0: cq_err_event_notifier:529:(pid 9422): CQ error on CQN 0x406, syndrome 0x1 mlx5_core 0000:08:00.0 eth2: mlx5e_cq_error_event: cqn=0x000406 event=0x04 This commit fixes the issue by using the corresponding parameters for ICOSQ and async ICOSQ. Fixes: c293ac927fbb ("net/mlx5e: Refactor build channel params") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3edc826cc6bb..a2e0b548bf57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1827,12 +1827,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, mlx5e_build_create_cq_param(&ccp, c); - err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, &c->async_icosq.cq); if (err) return err; - err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, &c->icosq.cq); if (err) goto err_close_async_icosq_cq; From b850bbff965129c34f50962638c0a66c82563536 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 22 Jan 2021 18:42:29 +0200 Subject: [PATCH 09/15] net/mlx5e: kTLS, Use refcounts to free kTLS RX priv context wait_for_resync is unreliable - if it timeouts, priv_rx will be freed anyway. 
However, mlx5e_ktls_handle_get_psv_completion will be called sooner or later, leading to use-after-free. For example, it can happen if a CQ error happened, and ICOSQ stopped, but later on the queues are destroyed, and ICOSQ is flushed with mlx5e_free_icosq_descs. This patch converts the lifecycle of priv_rx to fully refcount-based, so that the struct won't be freed before the refcount goes to zero. Fixes: 0419d8c9d8f8 ("net/mlx5e: kTLS, Add kTLS RX resync support") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ktls_rx.c | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 0f13b661f7f9..d06532d0baa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -57,6 +57,20 @@ struct mlx5e_ktls_offload_context_rx { struct mlx5e_ktls_rx_resync_ctx resync; }; +static bool mlx5e_ktls_priv_rx_put(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + if (!refcount_dec_and_test(&priv_rx->resync.refcnt)) + return false; + + kfree(priv_rx); + return true; +} + +static void mlx5e_ktls_priv_rx_get(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + refcount_inc(&priv_rx->resync.refcnt); +} + static int mlx5e_ktls_create_tir(struct mlx5_core_dev *mdev, u32 *tirn, u32 rqtn) { int err, inlen; @@ -326,7 +340,7 @@ static void resync_handle_work(struct work_struct *work) priv_rx = container_of(resync, struct mlx5e_ktls_offload_context_rx, resync); if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) { - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); return; } @@ -334,7 +348,7 @@ static void resync_handle_work(struct work_struct *work) sq = &c->async_icosq; if (resync_post_get_progress_params(sq, priv_rx)) - refcount_dec(&resync->refcnt); 
+ mlx5e_ktls_priv_rx_put(priv_rx); } static void resync_init(struct mlx5e_ktls_rx_resync_ctx *resync, @@ -377,7 +391,11 @@ static int resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx return err; } -/* Function is called with elevated refcount, it decreases it. */ +/* Function can be called with the refcount being either elevated or not. + * It decreases the refcount and may free the kTLS priv context. + * Refcount is not elevated only if tls_dev_del has been called, but GET_PSV was + * already in flight. + */ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, struct mlx5e_icosq *sq) { @@ -410,7 +428,7 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, tls_offload_rx_resync_async_request_end(priv_rx->sk, cpu_to_be32(hw_seq)); priv_rx->stats->tls_resync_req_end++; out: - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); dma_unmap_single(dev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); kfree(buf); } @@ -431,9 +449,9 @@ static bool resync_queue_get_psv(struct sock *sk) return false; resync = &priv_rx->resync; - refcount_inc(&resync->refcnt); + mlx5e_ktls_priv_rx_get(priv_rx); if (unlikely(!queue_work(resync->priv->tls->rx_wq, &resync->work))) - refcount_dec(&resync->refcnt); + mlx5e_ktls_priv_rx_put(priv_rx); return true; } @@ -625,31 +643,6 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, return err; } -/* Elevated refcount on the resync object means there are - * outstanding operations (uncompleted GET_PSV WQEs) that - * will read the resync / priv_rx objects once completed. - * Wait for them to avoid use-after-free. 
- */ -static void wait_for_resync(struct net_device *netdev, - struct mlx5e_ktls_rx_resync_ctx *resync) -{ -#define MLX5E_KTLS_RX_RESYNC_TIMEOUT 20000 /* msecs */ - unsigned long exp_time = jiffies + msecs_to_jiffies(MLX5E_KTLS_RX_RESYNC_TIMEOUT); - unsigned int refcnt; - - do { - refcnt = refcount_read(&resync->refcnt); - if (refcnt == 1) - return; - - msleep(20); - } while (time_before(jiffies, exp_time)); - - netdev_warn(netdev, - "Failed waiting for kTLS RX resync refcnt to be released (%u).\n", - refcnt); -} - void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) { struct mlx5e_ktls_offload_context_rx *priv_rx; @@ -671,8 +664,7 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) wait_for_completion(&priv_rx->add_ctx); resync = &priv_rx->resync; if (cancel_work_sync(&resync->work)) - refcount_dec(&resync->refcnt); - wait_for_resync(netdev, resync); + mlx5e_ktls_priv_rx_put(priv_rx); priv_rx->stats->tls_del++; if (priv_rx->rule.rule) @@ -680,5 +672,9 @@ void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) mlx5_core_destroy_tir(mdev, priv_rx->tirn); mlx5_ktls_destroy_key(mdev, priv_rx->key_id); - kfree(priv_rx); + /* priv_rx should normally be freed here, but if there is an outstanding + * GET_PSV, deallocation will be delayed until the CQE for GET_PSV is + * processed. + */ + mlx5e_ktls_priv_rx_put(priv_rx); } From d89ddaae1766f8fe571ea6eb63ec098ff556f1dd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 30 Dec 2020 09:58:32 +0200 Subject: [PATCH 10/15] net/mlx5: Disable devlink reload for multi port slave device Devlink reload can't be allowed on a multi port slave device, because reload of slave device doesn't take effect. The right flow is to disable devlink reload for multi port slave device. Hence, disabling it in mlx5_core probing. 
Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index ca6f2fc39ea0..ba1a4ae28097 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1396,7 +1396,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); pci_save_state(pdev); - devlink_reload_enable(devlink); + if (!mlx5_core_is_mp_slave(dev)) + devlink_reload_enable(devlink); return 0; err_load_one: From c70f8597fcc1399ef6d5b5ce648a31d887d5dba2 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 30 Dec 2020 10:58:34 +0200 Subject: [PATCH 11/15] net/mlx5: Disallow RoCE on multi port slave device In dual port mode, enabling or disabling RoCE on the slave device has no effect, e.g. the slave device's RoCE status remains unchanged. Therefore disallow it and add an error message. Enabling or disabling RoCE on the master device affects both the master and slave devices. 
Fixes: cc9defcbb8fa ("net/mlx5: Handle "enable_roce" devlink param") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 3261d0dc1104..317ce6b80b23 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -273,6 +273,10 @@ static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } + if (mlx5_core_is_mp_slave(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave device can't configure RoCE"); + return -EOPNOTSUPP; + } return 0; } From 7ab91f2b03367f9d25dd807ebdfb0d67295e0e41 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 3 Jan 2021 10:09:59 +0200 Subject: [PATCH 12/15] net/mlx5: Disallow RoCE on lag device In lag mode, setting roce enabled/disable of lag device have no effect. e.g.: bond device (roce/vf_lag) roce status remain unchanged. Therefore disable it and add an error message. 
Fixes: cc9defcbb8fa ("net/mlx5: Handle "enable_roce" devlink param") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 317ce6b80b23..c7073193db14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -273,8 +273,8 @@ static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); return -EOPNOTSUPP; } - if (mlx5_core_is_mp_slave(dev)) { - NL_SET_ERR_MSG_MOD(extack, "Multi port slave device can't configure RoCE"); + if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave/Lag device can't configure RoCE"); return -EOPNOTSUPP; } From edac23c2b3d3ac64cfcd351087295893671adbf5 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 31 Dec 2020 15:04:08 +0200 Subject: [PATCH 13/15] net/mlx5: Disable devlink reload for lag devices Devlink reload can't be allowed on lag devices since reloading one lag device will cause traffic on the bond to get stuck. Users who wish to reload a lag device need to remove the device from the bond, and only then reload it. 
Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index c7073193db14..41474e42a819 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -128,6 +128,11 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, { struct mlx5_core_dev *dev = devlink_priv(devlink); + if (mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode\n"); + return -EOPNOTSUPP; + } + switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: mlx5_unload_one(dev, false); From a2173131526dc845eb1968a15bc192b3fc2ff000 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Tue, 12 Jan 2021 12:37:37 +0000 Subject: [PATCH 14/15] net/mlx5e: CT: manage the lifetime of the ct entry object The ct entry object is accessed by the ct add, del, stats and restore methods. In addition, it is referenced from several hash tables. 
The lifetime of the ct entry object was not managed which triggered race conditions as in the following kasan dump: [ 3374.973945] ================================================================== [ 3374.988552] BUG: KASAN: use-after-free in memcmp+0x4c/0x98 [ 3374.999590] Read of size 1 at addr ffff00036129ea55 by task ksoftirqd/1/15 [ 3375.016415] CPU: 1 PID: 15 Comm: ksoftirqd/1 Tainted: G O 5.4.31+ #1 [ 3375.055301] Call trace: [ 3375.060214] dump_backtrace+0x0/0x238 [ 3375.067580] show_stack+0x24/0x30 [ 3375.074244] dump_stack+0xe0/0x118 [ 3375.081085] print_address_description.isra.9+0x74/0x3d0 [ 3375.091771] __kasan_report+0x198/0x1e8 [ 3375.099486] kasan_report+0xc/0x18 [ 3375.106324] __asan_load1+0x60/0x68 [ 3375.113338] memcmp+0x4c/0x98 [ 3375.119409] mlx5e_tc_ct_restore_flow+0x3a4/0x6f8 [mlx5_core] [ 3375.131073] mlx5e_rep_tc_update_skb+0x1d4/0x2f0 [mlx5_core] [ 3375.142553] mlx5e_handle_rx_cqe_rep+0x198/0x308 [mlx5_core] [ 3375.154034] mlx5e_poll_rx_cq+0x2a0/0x1060 [mlx5_core] [ 3375.164459] mlx5e_napi_poll+0x1d4/0xa78 [mlx5_core] [ 3375.174453] net_rx_action+0x28c/0x7a8 [ 3375.182004] __do_softirq+0x1b4/0x5d0 Manage the lifetime of the ct entry object by using synchronization mechanisms for concurrent access.
Fixes: ac991b48d43c ("net/mlx5e: CT: Offload established flows") Signed-off-by: Roi Dayan Signed-off-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 259 +++++++++++++----- 1 file changed, 192 insertions(+), 67 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 6bc6b48a56dc..24e2c0d955b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "lib/fs_chains.h" @@ -51,11 +52,11 @@ struct mlx5_tc_ct_priv { struct mlx5_flow_table *ct_nat; struct mlx5_flow_table *post_ct; struct mutex control_lock; /* guards parallel adds/dels */ - struct mutex shared_counter_lock; struct mapping_ctx *zone_mapping; struct mapping_ctx *labels_mapping; enum mlx5_flow_namespace_type ns_type; struct mlx5_fs_chains *chains; + spinlock_t ht_lock; /* protects ft entries */ }; struct mlx5_ct_flow { @@ -124,6 +125,10 @@ struct mlx5_ct_counter { bool is_shared; }; +enum { + MLX5_CT_ENTRY_FLAG_VALID, +}; + struct mlx5_ct_entry { struct rhash_head node; struct rhash_head tuple_node; @@ -134,6 +139,12 @@ struct mlx5_ct_entry { struct mlx5_ct_tuple tuple; struct mlx5_ct_tuple tuple_nat; struct mlx5_ct_zone_rule zone_rules[2]; + + struct mlx5_tc_ct_priv *ct_priv; + struct work_struct work; + + refcount_t refcnt; + unsigned long flags; }; static const struct rhashtable_params cts_ht_params = { @@ -740,6 +751,87 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, return err; } +static bool +mlx5_tc_ct_entry_valid(struct mlx5_ct_entry *entry) +{ + return test_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); +} + +static struct mlx5_ct_entry * +mlx5_tc_ct_entry_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_tuple *tuple) +{ + struct mlx5_ct_entry *entry; + + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, tuple, + 
tuples_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) { + return entry; + } else if (!entry) { + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, + tuple, tuples_nat_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) + return entry; + } + + return entry ? ERR_PTR(-EINVAL) : NULL; +} + +static void mlx5_tc_ct_entry_remove_from_tuples(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); + rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, + tuples_ht_params); +} + +static void mlx5_tc_ct_entry_del(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + mlx5_tc_ct_entry_del_rules(ct_priv, entry); + + spin_lock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_counter_put(ct_priv, entry); + kfree(entry); +} + +static void +mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + mlx5_tc_ct_entry_del(entry); +} + +static void mlx5_tc_ct_entry_del_work(struct work_struct *work) +{ + struct mlx5_ct_entry *entry = container_of(work, struct mlx5_ct_entry, work); + + mlx5_tc_ct_entry_del(entry); +} + +static void +__mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + struct mlx5e_priv *priv; + + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + priv = netdev_priv(entry->ct_priv->netdev); + INIT_WORK(&entry->work, mlx5_tc_ct_entry_del_work); + queue_work(priv->wq, &entry->work); +} + static struct mlx5_ct_counter * mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) { @@ -792,16 +884,26 @@ mlx5_tc_ct_shared_counter_get(struct mlx5_tc_ct_priv *ct_priv, } /* Use the same counter as the reverse direction */ - 
mutex_lock(&ct_priv->shared_counter_lock); - rev_entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &rev_tuple, - tuples_ht_params); - if (rev_entry) { - if (refcount_inc_not_zero(&rev_entry->counter->refcount)) { - mutex_unlock(&ct_priv->shared_counter_lock); - return rev_entry->counter; - } + spin_lock_bh(&ct_priv->ht_lock); + rev_entry = mlx5_tc_ct_entry_get(ct_priv, &rev_tuple); + + if (IS_ERR(rev_entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + goto create_counter; } - mutex_unlock(&ct_priv->shared_counter_lock); + + if (rev_entry && refcount_inc_not_zero(&rev_entry->counter->refcount)) { + ct_dbg("Using shared counter entry=0x%p rev=0x%p\n", entry, rev_entry); + shared_counter = rev_entry->counter; + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(rev_entry); + return shared_counter; + } + + spin_unlock_bh(&ct_priv->ht_lock); + +create_counter: shared_counter = mlx5_tc_ct_counter_create(ct_priv); if (IS_ERR(shared_counter)) { @@ -866,10 +968,14 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, if (!meta_action) return -EOPNOTSUPP; - entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (entry) - return 0; + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (entry && refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_put(entry); + return -EEXIST; + } + spin_unlock_bh(&ct_priv->ht_lock); entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -878,6 +984,8 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, entry->tuple.zone = ft->zone; entry->cookie = flow->cookie; entry->restore_cookie = meta_action->ct_metadata.cookie; + refcount_set(&entry->refcnt, 2); + entry->ct_priv = ct_priv; err = mlx5_tc_ct_rule_to_tuple(&entry->tuple, flow_rule); if (err) @@ -888,35 +996,40 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, if (err) goto err_set; - err = 
rhashtable_insert_fast(&ct_priv->ct_tuples_ht, - &entry->tuple_node, - tuples_ht_params); + spin_lock_bh(&ct_priv->ht_lock); + + err = rhashtable_lookup_insert_fast(&ft->ct_entries_ht, &entry->node, + cts_ht_params); + if (err) + goto err_entries; + + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_ht, + &entry->tuple_node, + tuples_ht_params); if (err) goto err_tuple; if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { - err = rhashtable_insert_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); if (err) goto err_tuple_nat; } + spin_unlock_bh(&ct_priv->ht_lock); err = mlx5_tc_ct_entry_add_rules(ct_priv, flow_rule, entry, ft->zone_restore_id); if (err) goto err_rules; - err = rhashtable_insert_fast(&ft->ct_entries_ht, &entry->node, - cts_ht_params); - if (err) - goto err_insert; + set_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + mlx5_tc_ct_entry_put(entry); /* this function reference */ return 0; -err_insert: - mlx5_tc_ct_entry_del_rules(ct_priv, entry); err_rules: + spin_lock_bh(&ct_priv->ht_lock); if (mlx5_tc_ct_entry_has_nat(entry)) rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, &entry->tuple_nat_node, tuples_nat_ht_params); @@ -925,47 +1038,43 @@ mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, &entry->tuple_node, tuples_ht_params); err_tuple: + rhashtable_remove_fast(&ft->ct_entries_ht, + &entry->node, + cts_ht_params); +err_entries: + spin_unlock_bh(&ct_priv->ht_lock); err_set: kfree(entry); - netdev_warn(ct_priv->netdev, - "Failed to offload ct entry, err: %d\n", err); + if (err != -EEXIST) + netdev_warn(ct_priv->netdev, "Failed to offload ct entry, err: %d\n", err); return err; } -static void -mlx5_tc_ct_del_ft_entry(struct mlx5_tc_ct_priv *ct_priv, - struct mlx5_ct_entry *entry) -{ - mlx5_tc_ct_entry_del_rules(ct_priv, entry); - 
mutex_lock(&ct_priv->shared_counter_lock); - if (mlx5_tc_ct_entry_has_nat(entry)) - rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, - &entry->tuple_nat_node, - tuples_nat_ht_params); - rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, - tuples_ht_params); - mutex_unlock(&ct_priv->shared_counter_lock); - mlx5_tc_ct_counter_put(ct_priv, entry); - -} - static int mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, struct flow_cls_offload *flow) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = flow->cookie; struct mlx5_ct_entry *entry; - entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } - mlx5_tc_ct_del_ft_entry(ft->ct_priv, entry); - WARN_ON(rhashtable_remove_fast(&ft->ct_entries_ht, - &entry->node, - cts_ht_params)); - kfree(entry); + if (!mlx5_tc_ct_entry_valid(entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(entry); return 0; } @@ -974,19 +1083,30 @@ static int mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, struct flow_cls_offload *f) { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; unsigned long cookie = f->cookie; struct mlx5_ct_entry *entry; u64 lastuse, packets, bytes; - entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, - cts_ht_params); - if (!entry) + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry) || !refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + return 
-EINVAL; + } + + spin_unlock_bh(&ct_priv->ht_lock); mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); flow_stats_update(&f->stats, bytes, packets, 0, lastuse, FLOW_ACTION_HW_STATS_DELAYED); + mlx5_tc_ct_entry_put(entry); return 0; } @@ -1478,11 +1598,9 @@ mlx5_tc_ct_add_ft_cb(struct mlx5_tc_ct_priv *ct_priv, u16 zone, static void mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) { - struct mlx5_tc_ct_priv *ct_priv = arg; struct mlx5_ct_entry *entry = ptr; - mlx5_tc_ct_del_ft_entry(ct_priv, entry); - kfree(entry); + mlx5_tc_ct_entry_put(entry); } static void @@ -1960,6 +2078,7 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, goto err_mapping_labels; } + spin_lock_init(&ct_priv->ht_lock); ct_priv->ns_type = ns_type; ct_priv->chains = chains; ct_priv->netdev = priv->netdev; @@ -1994,7 +2113,6 @@ mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, idr_init(&ct_priv->fte_ids); mutex_init(&ct_priv->control_lock); - mutex_init(&ct_priv->shared_counter_lock); rhashtable_init(&ct_priv->zone_ht, &zone_params); rhashtable_init(&ct_priv->ct_tuples_ht, &tuples_ht_params); rhashtable_init(&ct_priv->ct_tuples_nat_ht, &tuples_nat_ht_params); @@ -2037,7 +2155,6 @@ mlx5_tc_ct_clean(struct mlx5_tc_ct_priv *ct_priv) rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); rhashtable_destroy(&ct_priv->zone_ht); mutex_destroy(&ct_priv->control_lock); - mutex_destroy(&ct_priv->shared_counter_lock); idr_destroy(&ct_priv->fte_ids); kfree(ct_priv); } @@ -2059,14 +2176,22 @@ mlx5e_tc_ct_restore_flow(struct mlx5_tc_ct_priv *ct_priv, if (!mlx5_tc_ct_skb_to_tuple(skb, &tuple, zone)) return false; - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &tuple, - tuples_ht_params); - if (!entry) - entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, - &tuple, tuples_nat_ht_params); - if (!entry) + spin_lock(&ct_priv->ht_lock); + + entry = mlx5_tc_ct_entry_get(ct_priv, &tuple); + if (!entry) { + spin_unlock(&ct_priv->ht_lock); 
return false; + } + + if (IS_ERR(entry)) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + spin_unlock(&ct_priv->ht_lock); tcf_ct_flow_table_restore_skb(skb, entry->restore_cookie); + __mlx5_tc_ct_entry_put(entry); + return true; } From e1c3940c6003d820c787473c65711b49c2d1bc42 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 26 Jan 2021 16:46:11 +0200 Subject: [PATCH 15/15] net/mlx5e: Check tunnel offload is required before setting SWP Check that tunnel offload is required before setting Software Parser offsets to get Geneve HW offload. In case of Geneve packet we check HW offload support of SWP in mlx5e_tunnel_features_check() and set features accordingly, this should be reflected in skb offload requested by the kernel and we should add the Software Parser offsets only if requested. Otherwise, in case HW doesn't support SWP for Geneve, data path will mistakenly try to offload Geneve SKBs with skb->encapsulation set, regardless of whether offload was requested or not on this specific SKB. Fixes: e3cfc7e6b7bd ("net/mlx5e: TX, Add geneve tunnel stateless offload support") Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 1fae7fab8297..ff81b69a59a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -173,7 +173,7 @@ static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, #endif #if IS_ENABLED(CONFIG_GENEVE) - if (skb->encapsulation) + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) mlx5e_tx_tunnel_accel(skb, eseg, ihs); #endif