From 4914109a8e1e494c6aa9852f9e84ec77a5fc643f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:17 -0400 Subject: [PATCH 1/3] netfilter: allow exp not to be removed in nf_ct_find_expectation Currently nf_conntrack_in() calling nf_ct_find_expectation() will remove the exp from the hash table. However, in some scenario, we expect the exp not to be removed when the created ct will not be confirmed, like in OVS and TC conntrack in the following patches. This patch allows exp not to be removed by setting IPS_CONFIRMED in the status of the tmpl. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_expect.h | 2 +- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nft_ct.c | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cf0d81be5a96..165e7a03b8e9 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -100,7 +100,7 @@ nf_ct_expect_find_get(struct net *net, struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 992393102d5f..9f6f2e643575 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1756,7 +1756,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..81ca348915c9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,7 +211,7 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 38958e067aa8..e87fd4314c68 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -262,6 +262,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -368,6 +369,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } From 76622ced50a103e278ad560566b5e43d82f718c6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:18 -0400 Subject: [PATCH 2/3] net: sched: set IPS_CONFIRMED in tmpl status only when commit is set in act_ct With the following flows, the packets will be dropped if OVS TC offload is enabled. 'ip,ct_state=-trk,in_port=1 actions=ct(zone=1)' 'ip,ct_state=+trk+new+rel,in_port=1 actions=ct(commit,zone=1)' 'ip,ct_state=+trk+new+rel,in_port=1 actions=ct(commit,zone=2),normal' In the 1st flow, it finds the exp from the hashtable and removes it then creates the ct with this exp in act_ct. However, in the 2nd flow it goes to the OVS upcall at the 1st time. When the skb comes back from userspace, it has to create the ct again without exp(the exp was removed last time). With no 'rel' set in the ct, the 3rd flow can never get matched. In OVS conntrack, it works around it by adding its own exp lookup function ovs_ct_expect_find() where it doesn't remove the exp. Instead of creating a real ct, it only updates its keys with the exp and its master info. So when the skb comes back, the exp is still in the hashtable. However, we can't do this trick in act_ct, as tc flower match is using a real ct, and passing the exp and its master info to flower parsing via tc_skb_cb is also not possible (tc_skb_cb size is not big enough). The simple and clear fix is to not remove the exp at the 1st flow, namely, not set IPS_CONFIRMED in tmpl when commit is not set in act_ct. Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Aaron Conole Reviewed-by: Davide Caratti Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index abc71a06d634..7c652d14528b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net, } } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + if (p->ct_action & TCA_CT_ACT_COMMIT) + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: nf_ct_put(p->tmpl); From 8c8b733208058702da451b7d60a12c0ff90b6879 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:19 -0400 Subject: [PATCH 3/3] openvswitch: set IPS_CONFIRMED in tmpl status only when commit is set in conntrack By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed from the hashtable when lookup, we can simplify the exp processing code a lot in openvswitch conntrack. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- net/openvswitch/conntrack.c | 78 +++++-------------------------------- 1 file changed, 10 insertions(+), 68 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 331730fd3580..fa955e892210 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, return 0; } -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_ct_put(ct); - } - } - - return exp; -} - /* This replicates logic from nf_conntrack_core.c that is not exported. */ static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - struct nf_conntrack_expect *exp; + struct nf_conn *ct; + int err; - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. - */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); return 0; } @@ -1460,7 +1401,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); return 0; err_free_ct: __ovs_ct_free_action(&ct_info);