From 7ce5556f255a680d80daa31b1cedecf7f89e2c22 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:15 +0800 Subject: [PATCH 1/2] ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern mem_check() in net/ipv6/ip6_flowlabel.c reads fl_size without holding ip6_fl_lock. fl_intern() takes the lock immediately afterwards. The two checks therefore race against concurrent fl_intern, ip6_fl_gc and ip6_fl_purge writers, which makes the mem_check budget check approximate. Move spin_lock_bh(&ip6_fl_lock) and the matching unlock from fl_intern() into its only caller ipv6_flowlabel_get(). The mem_check() call now runs under the same critical section as the fl_intern() insert, so the budget check is exact. With all writers and the read of fl_size under ip6_fl_lock, convert fl_size from atomic_t to plain int. The four sites that update or read fl_size are fl_intern (insert path), ip6_fl_gc (garbage collector, the !sched check and the per-entry decrement), ip6_fl_purge (per-netns purge), and mem_check (budget check), and all four now run under ip6_fl_lock. This is a prerequisite for adding a per-netns budget alongside fl_size. The follow-up patch adds netns_ipv6::flowlabel_count and folds it into mem_check(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-2-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_flowlabel.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c92f98c6f6ec..a8974643195a 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -40,7 +40,7 @@ #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) -static atomic_t fl_size = ATOMIC_INIT(0); +static int fl_size; static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); @@ -163,7 +163,7 @@ static void ip6_fl_gc(struct timer_list *unused) if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } if (!sched || time_before(ttd, sched)) @@ -172,7 +172,7 @@ static void ip6_fl_gc(struct timer_list *unused) flp = &fl->next; } } - if (!sched && atomic_read(&fl_size)) + if (!sched && fl_size) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); @@ -196,7 +196,7 @@ static void __net_exit ip6_fl_purge(struct net *net) atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } flp = &fl->next; @@ -210,10 +210,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, { struct ip6_flowlabel *lfl; + lockdep_assert_held(&ip6_fl_lock); + fl->label = label & IPV6_FLOWLABEL_MASK; - rcu_read_lock(); - spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; @@ -235,8 +235,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); return lfl; } } @@ -244,9 +242,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); - atomic_inc(&fl_size); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); + fl_size++; return NULL; } @@ -464,10 +460,14 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { - int room = FL_MAX_SIZE - atomic_read(&fl_size); + int room; struct ipv6_fl_socklist *sfl; int count = 0; + lockdep_assert_held(&ip6_fl_lock); + + room = FL_MAX_SIZE - fl_size; + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; @@ -692,11 +692,19 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (!sfl1) goto done; + rcu_read_lock(); + spin_lock_bh(&ip6_fl_lock); err = mem_check(sk); + if (err == 0) + fl1 = fl_intern(net, fl, freq->flr_label); + else + fl1 = NULL; + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); + if (err != 0) goto done; - fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; From e68eadffb724b36ffd3d5619e0efcaf29ec2a175 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:16 +0800 Subject: [PATCH 2/2] ipv6: flowlabel: enforce per-netns limit for unprivileged callers fl_size, fl_ht and ip6_fl_lock in net/ipv6/ip6_flowlabel.c are file scope and shared across netns. mem_check() reads fl_size to decide whether to deny non-CAP_NET_ADMIN callers. capable() runs against init_user_ns, so an unprivileged user in any non-init userns can push fl_size past FL_MAX_SIZE - FL_MAX_SIZE / 4 and starve every other unprivileged userns on the host. Add struct netns_ipv6::flowlabel_count, bumped and decremented next to fl_size in fl_intern, ip6_fl_gc and ip6_fl_purge. The new field fills the existing 4-byte hole after ipmr_seq, so struct netns_ipv6 stays the same size on 64-bit builds. Bump FL_MAX_SIZE from 4096 to 8192. It has been 4096 since the file was added. Machines and connection counts have grown. mem_check() folds an extra per-netns ceiling into the existing non-CAP_NET_ADMIN conditional. The ceiling is half of the total budget that unprivileged callers have ever been able to use, i.e. (FL_MAX_SIZE - FL_MAX_SIZE / 4) / 2 = 3072 entries. With FL_MAX_SIZE doubled, this preserves the original per-user reach of 3K (what an unprivileged caller could already obtain before this change), while forcing an attacker to spread allocations across at least two netns to exhaust the global non-CAP_NET_ADMIN budget. CAP_NET_ADMIN against init_user_ns still bypasses both caps. The previous patch took ip6_fl_lock across mem_check and fl_intern, so the new flowlabel_count read in mem_check and the new flowlabel_count++ in fl_intern run under the same critical section. flowlabel_count is therefore plain int, like fl_size. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-3-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 499e4288170f..875916d60bfe 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -119,6 +119,7 @@ struct netns_ipv6 { struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; atomic_t ipmr_seq; + int flowlabel_count; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a8974643195a..b1ccdf0dc646 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,7 +36,7 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; - fl_free(fl); fl_size--; + fl->fl_net->ipv6.flowlabel_count--; + fl_free(fl); continue; } if (!sched || time_before(ttd, sched)) @@ -197,6 +198,7 @@ static void __net_exit ip6_fl_purge(struct net *net) *flp = fl->next; fl_free(fl); fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -243,6 +245,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -460,6 +463,9 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); int room; struct ipv6_fl_socklist *sfl; int count = 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS;