From 3df5dd46fca4b20efc4767c61d8ecc7249e83f5b Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:57:58 +0200 Subject: [PATCH 1/5] net/mlx5: Add max_tx_speed and its CAP bit to IFC Introduce the max_tx_speed field to the query and modify_vport_state structures. Add the esw_vport_state_max_tx_speed capability bit, indicating the firmware support modifying the max_tx_speed field via the MODIFY_VPORT_STATE command. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9dcd4bf355d..e844cfa4fe0a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1071,7 +1071,9 @@ struct mlx5_ifc_e_switch_cap_bits { u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; u8 root_ft_on_other_esw[0x1]; - u8 reserved_at_a[0xf]; + u8 reserved_at_a[0x1]; + u8 esw_vport_state_max_tx_speed[0x1]; + u8 reserved_at_c[0xd]; u8 esw_functions_changed[0x1]; u8 reserved_at_1a[0x1]; u8 ecpf_vport_exists[0x1]; @@ -5445,7 +5447,8 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 reserved_at_40[0x20]; - u8 reserved_at_60[0x18]; + u8 max_tx_speed[0x10]; + u8 reserved_at_70[0x8]; u8 admin_state[0x4]; u8 state[0x4]; }; @@ -7778,7 +7781,7 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x10]; + u8 max_tx_speed[0x10]; u8 ingress_connect[0x1]; u8 egress_connect[0x1]; u8 ingress_connect_valid[0x1]; From 50f1d188c580222d7a73e96a338a5dc82360ccc0 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:05 +0200 Subject: [PATCH 2/5] net/mlx5: Propagate LAG effective max_tx_speed to vports Currently, vports report only their parent's uplink speed, which in LAG setups does not reflect the true aggregated bandwidth. This makes it hard for upper-layer software to optimize load balancing decisions based on accurate bandwidth information. Fix the issue by calculating the possible maximum speed of a LAG as the sum of speeds of all active uplinks that are part of the LAG. Propagate this effective max speed to vports associated with the LAG whenever a relevant event occurs, such as physical port link state changes or LAG creation/modification. With this change, upper-layer components receive accurate bandwidth information corresponding to the active members of the LAG and can make better load balancing decisions. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 158 ++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 9 + .../ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + .../net/ethernet/mellanox/mlx5/core/port.c | 24 +++ .../net/ethernet/mellanox/mlx5/core/vport.c | 45 +++++ include/linux/mlx5/vport.h | 4 + 6 files changed, 241 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index a459a30f36ca..c9d943a230b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) ldev->mode != MLX5_LAG_MODE_MPESW; } +#ifdef CONFIG_MLX5_ESWITCH +static int +mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed, + int (*get_speed)(struct mlx5_core_dev *, u32 *)) +{ + struct mlx5_core_dev *pf_mdev; + int pf_idx; + u32 speed; + int ret; + + *sum_speed = 0; + mlx5_ldev_for_each(pf_idx, 0, ldev) { + pf_mdev = ldev->pf[pf_idx].dev; + if (!pf_mdev) + continue; + + ret = get_speed(pf_mdev, &speed); + if (ret) { + mlx5_core_dbg(pf_mdev, + "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n", + get_speed, dev_name(pf_mdev->device), + ret); + return ret; + } + + *sum_speed += speed; + } + + return 0; +} + +static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, max_speed, + mlx5_port_max_linkspeed); +} + +static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, + u32 speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + unsigned long i; + int ret; + + if (!esw) + return; + + if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed)) + return; + + mlx5_esw_for_each_vport(esw, i, vport) { + if (!vport) + continue; + + if (vport->vport == MLX5_VPORT_UPLINK) + continue; + + ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod, + vport->vport, true, speed); + if (ret) + mlx5_core_dbg(mdev, + "Failed to set vport %d speed %d, err=%d\n", + vport->vport, speed, ret); + } +} + +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + + speed = ldev->tracker.bond_speed_mbps; + + if (speed == SPEED_UNKNOWN) + return; + + /* If speed is not set, use the sum of max speeds of all PFs */ + if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) + return; + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} + +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + int ret; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + ret = mlx5_port_oper_linkspeed(mdev, &speed); + if (ret) { + mlx5_core_dbg(mdev, + "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n", + dev_name(mdev->device), ret); + continue; + } + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} +#endif + static void mlx5_do_bond(struct mlx5_lag *ldev) { int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); @@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) ndev); dev_put(ndev); } + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { + mlx5_lag_reset_vports_speed(ldev); mlx5_disable_lag(ldev); } } @@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, return 1; } +static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct ethtool_link_ksettings lksettings; + struct net_device *bond_dev; + int err; + + if (netif_is_lag_master(ndev)) + bond_dev = ndev; + else + bond_dev = netdev_master_upper_dev_get(ndev); + + if (!bond_dev) { + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + err = __ethtool_get_link_ksettings(bond_dev, &lksettings); + if (err) { + netdev_dbg(bond_dev, + "Failed to get speed for bond dev %s, err=%d\n", + bond_dev->name, err); + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + if (lksettings.base.speed == SPEED_UNKNOWN) + tracker->bond_speed_mbps = 0; + else + tracker->bond_speed_mbps = lksettings.base.speed; +} + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, break; } + if (changed) + mlx5_lag_update_tracker_speed(&tracker, ndev); + ldev->tracker = tracker; if (changed) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 4918eee2b3da..8de5640a0161 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -48,6 +48,7 @@ struct lag_tracker { unsigned int is_bonded:1; unsigned int has_inactive:1; enum netdev_lag_hash hash_type; + u32 bond_speed_mbps; }; /* LAG data of a ConnectX card. @@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev); void mlx5_lag_add_devices(struct mlx5_lag *ldev); struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev); +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev); +#else +static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {} +#endif + static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) { if (!MLX5_CAP_GEN(dev, vport_group_manager) || diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index cfebc110c02f..9fdb9a543cf1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, struct mlx5_link_info *info, bool force_legacy); +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 85a9e534f442..83044c9b6b41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, return link_modes; } +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + const struct mlx5_link_info *table; + struct mlx5_port_eth_proto eproto; + u32 oper_speed = 0; + u32 max_size; + bool ext; + int err; + int i; + + ext = mlx5_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + return err; + + mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false); + for (i = 0; i < max_size; ++i) + if (eproto.oper & MLX5E_PROT_MASK(i)) + oper_speed = max(oper_speed, table[i].speed); + + *speed = oper_speed; + return 0; +} + int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) { const struct mlx5_link_info *table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 306affbcfd3b..78b1b291cfa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, state); } +static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, + u8 *admin_state) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + *admin_state = MLX5_GET(query_vport_state_out, out, admin_state); + return 0; +} + int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { @@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + u8 admin_state; + int err; + + err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport, + &admin_state); + if (err) + return err; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); + MLX5_SET(modify_vport_state_in, in, admin_state, admin_state); + MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed); + + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); +} + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index f876bfc0669c..2acf10e9f60a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -41,6 +41,8 @@ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ mlx5_core_is_pf(mdev)) +#define MLX5_MAX_TX_SPEED_UNIT 100 + enum { MLX5_CAP_INLINE_MODE_L2, MLX5_CAP_INLINE_MODE_VPORT_CONTEXT, @@ -58,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr); int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr); From 28ea6036dad268a055b693d9c06a6f3d126096d3 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:13 +0200 Subject: [PATCH 3/5] net/mlx5: Handle port and vport speed change events in MPESW Add port change event handling logic for MPESW LAG mode, ensuring VFs are updated when the speed of LAG physical ports changes. This triggers a speed update workflow when relevant port state changes occur, enabling consistent and accurate reporting of VF bandwidth. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 38 ++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 2 + .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 39 +++++++++++++++++++ .../ethernet/mellanox/mlx5/core/lag/mpesw.h | 14 +++++++ .../net/ethernet/mellanox/mlx5/core/vport.c | 29 ++++++++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/vport.h | 2 + 7 files changed, 121 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index c9d943a230b5..2d6024ebe346 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -233,14 +233,25 @@ static void mlx5_ldev_free(struct kref *ref) { struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref); struct net *net; + int i; if (ldev->nb.notifier_call) { net = read_pnet(&ldev->net); unregister_netdevice_notifier_net(net, &ldev->nb); } + mlx5_ldev_for_each(i, 0, ldev) { + if (ldev->pf[i].dev && + ldev->pf[i].port_change_nb.nb.notifier_call) { + struct mlx5_nb *nb = &ldev->pf[i].port_change_nb; + + mlx5_eq_notifier_unregister(ldev->pf[i].dev, nb); + } + } + mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); + cancel_work_sync(&ldev->speed_update_work); destroy_workqueue(ldev->wq); mutex_destroy(&ldev->lock); kfree(ldev); @@ -274,6 +285,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) kref_init(&ldev->ref); mutex_init(&ldev->lock); INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + INIT_WORK(&ldev->speed_update_work, mlx5_mpesw_speed_update_work); ldev->nb.notifier_call = mlx5_lag_netdev_event; write_pnet(&ldev->net, mlx5_core_net(dev)); @@ -1033,6 +1045,13 @@ static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) mlx5_port_max_linkspeed); } +static int mlx5_lag_sum_devices_oper_speed(struct mlx5_lag *ldev, + u32 *oper_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, oper_speed, + mlx5_port_oper_linkspeed); +} + static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, u32 speed) { @@ -1070,10 +1089,14 @@ void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) u32 speed; int pf_idx; - speed = ldev->tracker.bond_speed_mbps; - - if (speed == SPEED_UNKNOWN) - return; + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (mlx5_lag_sum_devices_oper_speed(ldev, &speed)) + return; + } else { + speed = ldev->tracker.bond_speed_mbps; + if (speed == SPEED_UNKNOWN) + return; + } /* If speed is not set, use the sum of max speeds of all PFs */ if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) @@ -1520,6 +1543,10 @@ static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, ldev->pf[fn].dev = dev; dev->priv.lag = ldev; + + MLX5_NB_INIT(&ldev->pf[fn].port_change_nb, + mlx5_lag_mpesw_port_change_event, PORT_CHANGE); + mlx5_eq_notifier_register(dev, &ldev->pf[fn].port_change_nb); } static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, @@ -1531,6 +1558,9 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, if (ldev->pf[fn].dev != dev) return; + if (ldev->pf[fn].port_change_nb.nb.notifier_call) + mlx5_eq_notifier_unregister(dev, &ldev->pf[fn].port_change_nb); + ldev->pf[fn].dev = NULL; dev->priv.lag = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 8de5640a0161..be1afece5fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -39,6 +39,7 @@ struct lag_func { struct mlx5_core_dev *dev; struct net_device *netdev; bool has_drop; + struct mlx5_nb port_change_nb; }; /* Used for collection of netdev event info. */ @@ -67,6 +68,7 @@ struct mlx5_lag { struct lag_tracker tracker; struct workqueue_struct *wq; struct delayed_work bond_work; + struct work_struct speed_update_work; struct notifier_block nb; possible_net_t net; struct lag_mp lag_mp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 2d86af8f0d9b..04762562d7d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -110,6 +110,8 @@ static int enable_mpesw(struct mlx5_lag *ldev) goto err_rescan_drivers; } + mlx5_lag_set_vports_agg_speed(ldev); + return 0; err_rescan_drivers: @@ -223,3 +225,40 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev) return ldev && ldev->mode == MLX5_LAG_MODE_MPESW; } EXPORT_SYMBOL(mlx5_lag_is_mpesw); + +void mlx5_mpesw_speed_update_work(struct work_struct *work) +{ + struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, + speed_update_work); + + mutex_lock(&ldev->lock); + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (ldev->mode_changes_in_progress) + queue_work(ldev->wq, &ldev->speed_update_work); + else + mlx5_lag_set_vports_agg_speed(ldev); + } + + mutex_unlock(&ldev->lock); +} + +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5_nb *mlx5_nb = container_of(nb, struct mlx5_nb, nb); + struct lag_func *lag_func = container_of(mlx5_nb, + struct lag_func, + port_change_nb); + struct mlx5_core_dev *dev = lag_func->dev; + struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_eqe *eqe = data; + + if (!ldev) + return NOTIFY_DONE; + + if (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_DOWN || + eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) + queue_work(ldev->wq, &ldev->speed_update_work); + + return NOTIFY_OK; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h index 02520f27a033..f5d9b5c97b0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -32,4 +32,18 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev); int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_mpesw_speed_update_work(struct work_struct *work); +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data); +#else +static inline void mlx5_mpesw_speed_update_work(struct work_struct *work) {} +static inline int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, + void *data) +{ + return NOTIFY_DONE; +} +#endif /* CONFIG_MLX5_ESWITCH */ + #endif /* __MLX5_LAG_MPESW_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 78b1b291cfa4..cb098d3eb2fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -122,6 +122,35 @@ int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + u32 state; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, op_mod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + state = MLX5_GET(query_vport_state_out, out, state); + if (state == VPORT_STATE_DOWN) { + *max_tx_speed = 0; + return 0; + } + + *max_tx_speed = MLX5_GET(query_vport_state_out, out, max_tx_speed); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_vport_max_tx_speed); + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1c54aa6f74fb..9e0ab3cfab73 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,6 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 2acf10e9f60a..dfa2fe32217a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -60,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed); int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, From f0b2fde98065e49795ce6824837b3f53fdf16e5d Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Thu, 18 Dec 2025 17:58:20 +0200 Subject: [PATCH 4/5] net/mlx5: Add support for querying bond speed Add mlx5_lag_query_bond_speed() to query the aggregated speed of lag configurations with a bond device. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 27 +++++++++++++++++++ include/linux/mlx5/driver.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 2d6024ebe346..9fe47c836ebd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1464,6 +1464,33 @@ static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, tracker->bond_speed_mbps = lksettings.base.speed; } +/* Returns speed in Mbps. */ +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *mdev, u32 *speed) +{ + struct mlx5_lag *ldev; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(mdev); + if (!ldev) { + ret = -ENODEV; + goto unlock; + } + + *speed = ldev->tracker.bond_speed_mbps; + + if (*speed == SPEED_UNKNOWN) { + mlx5_core_dbg(mdev, "Bond speed is unknown\n"); + ret = -EINVAL; + } + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_lag_query_bond_speed); + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9e0ab3cfab73..e2d067b1e67b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,7 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); -int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); From 49e41f3ea3f7545c732a0b399cb123173afc5cfe Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 12 Jan 2026 08:50:08 +0200 Subject: [PATCH 5/5] net/mlx5: Add IFC bits for extended ETS rate limit bandwidth value Add hardware interface definitions to support extended bandwidth rate limiting in the QoS Enhanced Transmission Selection (ETS) configuration. The new fields include: - max_bw_value: extended from 8-bit to 16-bit in ets_tcn_config_reg, simplifying the implementation by using a single field instead of separate MSB/LSB fields. - qetcr_qshr_max_bw_val_msb: capability bit in qcam_qos_feature_cap_mask indicating device support for the extended 16-bit max_bw_value field. These interface additions are prerequisites for increasing the per-TC rate limit beyond 255 Gbps to support higher-bandwidth NICs. Signed-off-by: Alexei Lazar Reviewed-by: Dragos Tatulea Reviewed-by: Gal Pressman Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1768200608-1543180-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e844cfa4fe0a..775cb0c56865 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11009,7 +11009,9 @@ struct mlx5_ifc_qcam_access_reg_cap_mask { }; struct mlx5_ifc_qcam_qos_feature_cap_mask { - u8 qcam_qos_feature_cap_mask_127_to_1[0x7F]; + u8 qcam_qos_feature_cap_mask_127_to_5[0x7B]; + u8 qetcr_qshr_max_bw_val_msb[0x1]; + u8 qcam_qos_feature_cap_mask_3_to_1[0x3]; u8 qpts_trust_both[0x1]; }; @@ -11965,8 +11967,7 @@ struct mlx5_ifc_ets_tcn_config_reg_bits { u8 reserved_at_20[0xc]; u8 max_bw_units[0x4]; - u8 reserved_at_30[0x8]; - u8 max_bw_value[0x8]; + u8 max_bw_value[0x10]; }; struct mlx5_ifc_ets_global_config_reg_bits {