From 0b7cfa4082fbf550595bc0e40f05614bd83bf0cd Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 23 Dec 2021 14:38:28 +0200 Subject: [PATCH 01/11] net/mlx5e: Fix page DMA map/unmap attributes Driver initiates DMA sync, hence it may skip CPU sync. Add DMA_ATTR_SKIP_CPU_SYNC as input attribute both to dma_map_page and dma_unmap_page to avoid redundant sync with the CPU. When forcing the device to work with SWIOTLB, the extra sync might cause data corruption. The driver unmaps the whole page while the hardware used just a part of the bounce buffer. So syncing overrides the entire page with bounce buffer that only partially contains real data. Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE") Fixes: db05815b36cb ("net/mlx5e: Add XSK zero-copy support") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c index 7b562d2c8a19..279cd8f4e79f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c @@ -11,13 +11,13 @@ static int mlx5e_xsk_map_pool(struct mlx5e_priv *priv, { struct device *dev = mlx5_core_dma_dev(priv->mdev); - return xsk_pool_dma_map(pool, dev, 0); + return xsk_pool_dma_map(pool, dev, DMA_ATTR_SKIP_CPU_SYNC); } static void mlx5e_xsk_unmap_pool(struct mlx5e_priv *priv, struct xsk_buff_pool *pool) { - return xsk_pool_dma_unmap(pool, 0); + return xsk_pool_dma_unmap(pool, DMA_ATTR_SKIP_CPU_SYNC); } static int mlx5e_xsk_get_pools(struct mlx5e_xsk *xsk) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 793511d5ee4c..dfc6604b9538 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -278,8 +278,8 @@ static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq, if (unlikely(!dma_info->page)) return -ENOMEM; - dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0, - PAGE_SIZE, rq->buff.map_dir); + dma_info->addr = dma_map_page_attrs(rq->pdev, dma_info->page, 0, PAGE_SIZE, + rq->buff.map_dir, DMA_ATTR_SKIP_CPU_SYNC); if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) { page_pool_recycle_direct(rq->page_pool, dma_info->page); dma_info->page = NULL; @@ -300,7 +300,8 @@ static inline int mlx5e_page_alloc(struct mlx5e_rq *rq, void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) { - dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir); + dma_unmap_page_attrs(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir, + DMA_ATTR_SKIP_CPU_SYNC); } void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, From de31854ece175e12ff3c35d07f340988823aed34 Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Wed, 24 Nov 2021 09:37:26 +0200 Subject: [PATCH 02/11] net/mlx5e: Fix nullptr on deleting mirroring rule Deleting a Tc rule with multiple outputs, one of which is internal port, like this one: tc filter del dev enp8s0f0_0 ingress protocol ip pref 5 flower \ dst_mac 0c:42:a1:d1:d0:88 \ src_mac e4:ea:09:08:00:02 \ action tunnel_key set \ src_ip 0.0.0.0 \ dst_ip 7.7.7.8 \ id 8 \ dst_port 4789 \ action mirred egress mirror dev vxlan_sys_4789 pipe \ action mirred egress redirect dev enp8s0f0_1 Triggers a call trace: BUG: kernel NULL pointer dereference, address: 0000000000000230 RIP: 0010:del_sw_hw_rule+0x2b/0x1f0 [mlx5_core] Call Trace: tree_remove_node+0x16/0x30 [mlx5_core] mlx5_del_flow_rules+0x51/0x160 [mlx5_core] __mlx5_eswitch_del_rule+0x4b/0x170 [mlx5_core] mlx5e_tc_del_fdb_flow+0x295/0x550 [mlx5_core] mlx5e_flow_put+0x1f/0x70 [mlx5_core] mlx5e_delete_flower+0x286/0x390 [mlx5_core] tc_setup_cb_destroy+0xac/0x170 fl_hw_destroy_filter+0x94/0xc0 [cls_flower] __fl_delete+0x15e/0x170 [cls_flower] fl_delete+0x36/0x80 [cls_flower] tc_del_tfilter+0x3a6/0x6e0 rtnetlink_rcv_msg+0xe5/0x360 ? rtnl_calcit.isra.0+0x110/0x110 netlink_rcv_skb+0x46/0x110 netlink_unicast+0x16b/0x200 netlink_sendmsg+0x202/0x3d0 sock_sendmsg+0x33/0x40 ____sys_sendmsg+0x1c3/0x200 ? copy_msghdr_from_user+0xd6/0x150 ___sys_sendmsg+0x88/0xd0 ? ___sys_recvmsg+0x88/0xc0 ? do_futex+0x10c/0x460 __sys_sendmsg+0x59/0xa0 do_syscall_64+0x48/0x140 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix by disabling offloading for flows matching esw_is_chain_src_port_rewrite() which have more than one output. Fixes: 10742efc20a4 ("net/mlx5e: VF tunnel TX traffic offloading") Signed-off-by: Dima Chumak Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/eswitch_offloads.c | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 32bc08a39925..ccb66428aeb5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -295,26 +295,28 @@ esw_setup_chain_src_port_rewrite(struct mlx5_flow_destination *dest, int *i) { struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; - int j, err; + int err; if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SRC_REWRITE)) return -EOPNOTSUPP; - for (j = esw_attr->split_count; j < esw_attr->out_count; j++, (*i)++) { - err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i); - if (err) - goto err_setup_chain; + /* flow steering cannot handle more than one dest with the same ft + * in a single flow + */ + if (esw_attr->out_count - esw_attr->split_count > 1) + return -EOPNOTSUPP; - if (esw_attr->dests[j].pkt_reformat) { - flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; - flow_act->pkt_reformat = esw_attr->dests[j].pkt_reformat; - } + err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i); + if (err) + return err; + + if (esw_attr->dests[esw_attr->split_count].pkt_reformat) { + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act->pkt_reformat = esw_attr->dests[esw_attr->split_count].pkt_reformat; } - return 0; + (*i)++; -err_setup_chain: - esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, j); - return err; + return 0; } static void esw_cleanup_chain_src_port_rewrite(struct mlx5_eswitch *esw, From 885751eb1b01d276e38f57d78c583e4ce006c5ed Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Wed, 29 Dec 2021 16:10:41 +0200 Subject: [PATCH 03/11] net/mlx5e: Fix wrong usage of fib_info_nh when routes with nexthop objects are used Creating routes with nexthop objects while in switchdev mode leads to access to un-allocated memory and trigger bellow call trace due to hitting WARN_ON. This is caused due to illegal usage of fib_info_nh in TC tunnel FIB event handling to resolve the FIB device while fib_info built in with nexthop. Fixed by ignoring attempts to use nexthop objects with routes until support can be properly added. WARNING: CPU: 1 PID: 1724 at include/net/nexthop.h:468 mlx5e_tc_tun_fib_event+0x448/0x570 [mlx5_core] CPU: 1 PID: 1724 Comm: ip Not tainted 5.15.0_for_upstream_min_debug_2021_11_09_02_04 #1 RIP: 0010:mlx5e_tc_tun_fib_event+0x448/0x570 [mlx5_core] RSP: 0018:ffff8881349f7910 EFLAGS: 00010202 RAX: ffff8881492f1980 RBX: ffff8881349f79e8 RCX: 0000000000000000 RDX: ffff8881349f79e8 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8881349f7950 R08: 00000000000000fe R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff88811e9d0000 R13: ffff88810eb62000 R14: ffff888106710268 R15: 0000000000000018 FS: 00007f1d5ca6e800(0000) GS:ffff88852c880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffedba44ff8 CR3: 0000000129808004 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: atomic_notifier_call_chain+0x42/0x60 call_fib_notifiers+0x21/0x40 fib_table_insert+0x479/0x6d0 ? try_charge_memcg+0x480/0x6d0 inet_rtm_newroute+0x65/0xb0 rtnetlink_rcv_msg+0x2af/0x360 ? page_add_file_rmap+0x13/0x130 ? do_set_pte+0xcd/0x120 ? rtnl_calcit.isra.0+0x120/0x120 netlink_rcv_skb+0x4e/0xf0 netlink_unicast+0x1ee/0x2b0 netlink_sendmsg+0x22e/0x460 sock_sendmsg+0x33/0x40 ____sys_sendmsg+0x1d1/0x1f0 ___sys_sendmsg+0xab/0xf0 ? __mod_memcg_lruvec_state+0x40/0x60 ? __mod_lruvec_page_state+0x95/0xd0 ? page_add_new_anon_rmap+0x4e/0xf0 ? __handle_mm_fault+0xec6/0x1470 __sys_sendmsg+0x51/0x90 ? internal_get_user_pages_fast+0x480/0xa10 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 8914add2c9e5 ("net/mlx5e: Handle FIB events to update tunnel endpoint device") Signed-off-by: Maor Dickman Reviewed-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 042b1abe1437..62cbd15ffc34 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -1579,6 +1579,8 @@ mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv, struct net_device *fib_dev; fen_info = container_of(info, struct fib_entry_notifier_info, info); + if (fen_info->fi->nh) + return NULL; fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops || fen_info->dst_len != 32) From 9e72a55a3c9d54b38a704bb7292d984574a81d9d Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 30 Dec 2021 11:20:10 +0200 Subject: [PATCH 04/11] net/mlx5e: Don't block routes with nexthop objects in SW Routes with nexthop objects is currently not supported by multipath offload and any attempts to use it is blocked, however this also block adding SW routes with nexthop. Resolve this by returning NOTIFY_DONE instead of an error which will allow such a route to be created in SW but not offloaded. This fix also solve an issue which block adding such routes on different devices due to missing check if the route FIB device is one of multipath devices. Fixes: 6a87afc072c3 ("mlx5: Fail attempts to use routes with nexthop objects") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index bf4d3cbefa63..1ca01a5b6cdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -268,10 +268,8 @@ static int mlx5_lag_fib_event(struct notifier_block *nb, fen_info = container_of(info, struct fib_entry_notifier_info, info); fi = fen_info->fi; - if (fi->nh) { - NL_SET_ERR_MSG_MOD(info->extack, "IPv4 route with nexthop objects is not supported"); - return notifier_from_errno(-EINVAL); - } + if (fi->nh) + return NOTIFY_DONE; fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; if (fib_dev != ldev->pf[MLX5_LAG_P1].netdev && fib_dev != ldev->pf[MLX5_LAG_P2].netdev) { From 64050cdad0983ad8060e33c3f4b5aee2366bcebd Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 24 Oct 2021 11:47:41 +0300 Subject: [PATCH 05/11] Revert "net/mlx5e: Block offload of outer header csum for UDP tunnels" This reverts commit 6d6727dddc7f93fcc155cb8d0c49c29ae0e71122. Although the NIC doesn't support offload of outer header CSUM, using gso_partial_features allows offloading the tunnel's segmentation. The driver relies on the stack CSUM calculation of the outer header. For this, NETIF_F_GSO_UDP_TUNNEL_CSUM must be a member of the device's features. Fixes: 6d6727dddc7f ("net/mlx5e: Block offload of outer header csum for UDP tunnels") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 41379844eee1..de8acd3217c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4789,9 +4789,13 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) } if (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)) { - netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; - netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL; - netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL; + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { From 01c3fd113ef50490ffd43f78f347ef6bb008510b Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 24 Oct 2021 16:52:23 +0300 Subject: [PATCH 06/11] Revert "net/mlx5e: Block offload of outer header csum for GRE tunnel" This reverts commit 54e1217b90486c94b26f24dcee1ee5ef5372f832. Although the NIC doesn't support offload of outer header CSUM, using gso_partial_features allows offloading the tunnel's segmentation. The driver relies on the stack CSUM calculation of the outer header. For this, NETIF_F_GSO_GRE_CSUM must be a member of the device's features. Fixes: 54e1217b9048 ("net/mlx5e: Block offload of outer header csum for GRE tunnel") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index de8acd3217c1..d92b82cdfd4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4799,9 +4799,12 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { - netdev->hw_features |= NETIF_F_GSO_GRE; - netdev->hw_enc_features |= NETIF_F_GSO_GRE; - netdev->gso_partial_features |= NETIF_F_GSO_GRE; + netdev->hw_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->gso_partial_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; } if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_IPIP)) { From b6dfff21a170af5c695ebaa153b7f5e297ddca03 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 16 Jun 2021 10:55:56 +0300 Subject: [PATCH 07/11] net/mlx5e: Fix matching on modified inner ip_ecn bits Tunnel device follows RFC 6040, and during decapsulation inner ip_ecn might change depending on inner and outer ip_ecn as follows: +---------+----------------------------------------+ |Arriving | Arriving Outer Header | | Inner +---------+---------+---------+----------+ | Header | Not-ECT | ECT(0) | ECT(1) | CE | +---------+---------+---------+---------+----------+ | Not-ECT | Not-ECT | Not-ECT | Not-ECT | | | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE* | | ECT(1) | ECT(1) | ECT(1) | ECT(1)* | CE* | | CE | CE | CE | CE | CE | +---------+---------+---------+---------+----------+ Cells marked above are changed from original inner packet ip_ecn value. Tc then matches on the modified inner ip_ecn, but hw offload which matches the inner ip_ecn value before decap, will fail. Fix that by mapping all the cases of outer and inner ip_ecn matching, and only supporting cases where we know inner wouldn't be changed by decap, or in the outer ip_ecn=CE case, inner ip_ecn didn't matter. Fixes: bcef735c59f2 ("net/mlx5e: Offload TC matching on tos/ttl for ip tunnels") Signed-off-by: Paul Blakey Reviewed-by: Oz Shlomo Reviewed-by: Eli Cohen Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 120 +++++++++++++++++- 1 file changed, 116 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 5e454a14428f..9b3adaccc9be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1949,6 +1949,111 @@ u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer) return ip_version; } +/* Tunnel device follows RFC 6040, see include/net/inet_ecn.h. + * And changes inner ip_ecn depending on inner and outer ip_ecn as follows: + * +---------+----------------------------------------+ + * |Arriving | Arriving Outer Header | + * | Inner +---------+---------+---------+----------+ + * | Header | Not-ECT | ECT(0) | ECT(1) | CE | + * +---------+---------+---------+---------+----------+ + * | Not-ECT | Not-ECT | Not-ECT | Not-ECT | | + * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE* | + * | ECT(1) | ECT(1) | ECT(1) | ECT(1)* | CE* | + * | CE | CE | CE | CE | CE | + * +---------+---------+---------+---------+----------+ + * + * Tc matches on inner after decapsulation on tunnel device, but hw offload matches + * the inner ip_ecn value before hardware decap action. + * + * Cells marked are changed from original inner packet ip_ecn value during decap, and + * so matching those values on inner ip_ecn before decap will fail. + * + * The following helper allows offload when inner ip_ecn won't be changed by outer ip_ecn, + * except for the outer ip_ecn = CE, where in all cases inner ip_ecn will be changed to CE, + * and such we can drop the inner ip_ecn=CE match. + */ + +static int mlx5e_tc_verify_tunnel_ecn(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + bool *match_inner_ecn) +{ + u8 outer_ecn_mask = 0, outer_ecn_key = 0, inner_ecn_mask = 0, inner_ecn_key = 0; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_ip match; + + *match_inner_ecn = true; + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { + flow_rule_match_enc_ip(rule, &match); + outer_ecn_key = match.key->tos & INET_ECN_MASK; + outer_ecn_mask = match.mask->tos & INET_ECN_MASK; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IP)) { + flow_rule_match_ip(rule, &match); + inner_ecn_key = match.key->tos & INET_ECN_MASK; + inner_ecn_mask = match.mask->tos & INET_ECN_MASK; + } + + if (outer_ecn_mask != 0 && outer_ecn_mask != INET_ECN_MASK) { + NL_SET_ERR_MSG_MOD(extack, "Partial match on enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, "Partial match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (!outer_ecn_mask) { + if (!inner_ecn_mask) + return 0; + + NL_SET_ERR_MSG_MOD(extack, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (inner_ecn_mask && inner_ecn_mask != INET_ECN_MASK) { + NL_SET_ERR_MSG_MOD(extack, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (!inner_ecn_mask) + return 0; + + /* Both inner and outer have full mask on ecn */ + + if (outer_ecn_key == INET_ECN_ECT_1) { + /* inner ecn might change by DECAP action */ + + NL_SET_ERR_MSG_MOD(extack, "Match on enc_tos ecn = ECT(1) isn't supported"); + netdev_warn(priv->netdev, "Match on enc_tos ecn = ECT(1) isn't supported"); + return -EOPNOTSUPP; + } + + if (outer_ecn_key != INET_ECN_CE) + return 0; + + if (inner_ecn_key != INET_ECN_CE) { + /* Can't happen in software, as packet ecn will be changed to CE after decap */ + NL_SET_ERR_MSG_MOD(extack, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); + netdev_warn(priv->netdev, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); + return -EOPNOTSUPP; + } + + /* outer ecn = CE, inner ecn = CE, as decap will change inner ecn to CE in anycase, + * drop match on inner ecn + */ + *match_inner_ecn = false; + + return 0; +} + static int parse_tunnel_attr(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, @@ -2144,6 +2249,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct flow_dissector *dissector = rule->match.dissector; enum fs_flow_table_type fs_type; + bool match_inner_ecn = true; u16 addr_type = 0; u8 ip_proto = 0; u8 *match_level; @@ -2197,6 +2303,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, headers_c = get_match_inner_headers_criteria(spec); headers_v = get_match_inner_headers_value(spec); } + + err = mlx5e_tc_verify_tunnel_ecn(priv, f, &match_inner_ecn); + if (err) + return err; } err = mlx5e_flower_parse_meta(filter_dev, f); @@ -2420,10 +2530,12 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, struct flow_match_ip match; flow_rule_match_ip(rule, &match); - MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, - match.mask->tos & 0x3); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, - match.key->tos & 0x3); + if (match_inner_ecn) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, + match.mask->tos & 0x3); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, + match.key->tos & 0x3); + } MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, match.mask->tos >> 2); From a1c7c49c2091926962f8c1c866d386febffec5d8 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 30 Dec 2021 08:54:08 +0200 Subject: [PATCH 08/11] net/mlx5: Fix access to sf_dev_table on allocation failure Even when SF devices are supported, the SF device table allocation can still fail. In such case mlx5_sf_dev_supported still reports true, but SF device table is invalid. This can result in NULL table access. Hence, fix it by adding NULL table check. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c index f37db7cc32a6..7da012ff0d41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -30,10 +30,7 @@ bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev) { struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; - if (!mlx5_sf_dev_supported(dev)) - return false; - - return !xa_empty(&table->devices); + return table && !xa_empty(&table->devices); } static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf) From 07f6dc4024ea1d2314b9c8b81fd4e492864fcca1 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Mon, 3 Jan 2022 15:04:18 +0200 Subject: [PATCH 09/11] net/mlx5e: Sync VXLAN udp ports during uplink representor profile change Currently during NIC profile disablement all VXLAN udp ports offloaded to the HW are flushed and during its enablement the driver send notification to the stack to inform the core that the entire UDP tunnel port state has been lost, uplink representor doesn't have the same behavior which can cause VXLAN udp ports offload to be in bad state while moving between modes while VXLAN interface exist. Fixed by aligning the uplink representor profile behavior to the NIC behavior. Fixes: 84db66124714 ("net/mlx5e: Move set vxlan nic info to profile init") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 48895d79796a..c0df4b1115b7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -50,6 +50,7 @@ #include "fs_core.h" #include "lib/mlx5.h" #include "lib/devcom.h" +#include "lib/vxlan.h" #define CREATE_TRACE_POINTS #include "diag/en_rep_tracepoint.h" #include "en_accel/ipsec.h" @@ -1027,6 +1028,7 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) rtnl_lock(); if (netif_running(netdev)) mlx5e_open(netdev); + udp_tunnel_nic_reset_ntf(priv->netdev); netif_device_attach(netdev); rtnl_unlock(); } @@ -1048,6 +1050,7 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) mlx5_notifier_unregister(mdev, &priv->events_nb); mlx5e_rep_tc_disable(priv); mlx5_lag_remove_netdev(mdev, priv->netdev); + mlx5_vxlan_reset_to_default(mdev->vxlan); } static MLX5E_DEFINE_STATS_GRP(sw_rep, 0); From 8e715cd613a1e872b9d918e912d90b399785761a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 5 Dec 2021 12:07:49 +0200 Subject: [PATCH 10/11] net/mlx5: Set command entry semaphore up once got index free Avoid a race where command work handler may fail to allocate command entry index, by holding the command semaphore down till command entry index is being freed. Fixes: 410bd754cd73 ("net/mlx5: Add retry mechanism to the command entry index allocation") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index a46284ca5172..f588503157d0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -148,8 +148,12 @@ static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) if (!refcount_dec_and_test(&ent->refcnt)) return; - if (ent->idx >= 0) - cmd_free_index(ent->cmd, ent->idx); + if (ent->idx >= 0) { + struct mlx5_cmd *cmd = ent->cmd; + + cmd_free_index(cmd, ent->idx); + up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); + } cmd_free_ent(ent); } @@ -1602,8 +1606,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force vector = vec & 0xffffffff; for (i = 0; i < (1 << cmd->log_sz); i++) { if (test_bit(i, &vector)) { - struct semaphore *sem; - ent = cmd->ent_arr[i]; /* if we already completed the command, ignore it */ @@ -1626,10 +1628,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) cmd_ent_put(ent); - if (ent->page_queue) - sem = &cmd->pages_sem; - else - sem = &cmd->sem; ent->ts2 = ktime_get_ns(); memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); dump_command(dev, ent, 0); @@ -1683,7 +1681,6 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force */ complete(&ent->done); } - up(sem); } } } From 4f6626b0e140867fd6d5a2e9d4ceaef97f10f46a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 5 Dec 2021 11:20:59 +0200 Subject: [PATCH 11/11] Revert "net/mlx5: Add retry mechanism to the command entry index allocation" This reverts commit 410bd754cd73c4a2ac3856d9a03d7b08f9c906bf. The reverted commit had added a retry mechanism to the command entry index allocation. The previous patch ensures that there is a free command entry index once the command work handler holds the command semaphore. Thus the retry mechanism is not needed. Fixes: 410bd754cd73 ("net/mlx5: Add retry mechanism to the command entry index allocation") Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index f588503157d0..17fe05809653 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -904,25 +904,6 @@ static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode) return cmd->allowed_opcode == opcode; } -static int cmd_alloc_index_retry(struct mlx5_cmd *cmd) -{ - unsigned long alloc_end = jiffies + msecs_to_jiffies(1000); - int idx; - -retry: - idx = cmd_alloc_index(cmd); - if (idx < 0 && time_before(jiffies, alloc_end)) { - /* Index allocation can fail on heavy load of commands. This is a temporary - * situation as the current command already holds the semaphore, meaning that - * another command completion is being handled and it is expected to release - * the entry index soon. - */ - cpu_relax(); - goto retry; - } - return idx; -} - bool mlx5_cmd_is_down(struct mlx5_core_dev *dev) { return pci_channel_offline(dev->pdev) || @@ -950,7 +931,7 @@ static void cmd_work_handler(struct work_struct *work) sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { - alloc_ret = cmd_alloc_index_retry(cmd); + alloc_ret = cmd_alloc_index(cmd); if (alloc_ret < 0) { mlx5_core_err_rl(dev, "failed to allocate command entry\n"); if (ent->callback) {