Files
linux/net/mptcp/mptcp_diag.c
Paolo Abeni ee458a3f31 mptcp: introduce mptcp-level backlog
We are soon using it for incoming data processing.
MPTCP can't leverage the sk_backlog, as the latter is processed
before the release callback, and such callback for MPTCP releases
and re-acquires the socket spinlock, breaking the sk_backlog processing
assumption.

Add a skb backlog list inside the mptcp sock struct, and implement
basic helpers to transfer packets to, and purge, such list.

Packets in the backlog are memory accounted and still use the incoming
subflow receive memory, to allow back-pressure. The backlog size is
implicitly bounded to the sum of the subflows' rcvbuf.

When a subflow is closed, references from the backlog to such sock
are removed.

No packet is currently added to the backlog, so no functional changes
intended here.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-13-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2025-11-24 19:49:43 -08:00

244 lines
5.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* MPTCP socket monitoring support
*
* Copyright (c) 2020 Red Hat
*
* Author: Paolo Abeni <pabeni@redhat.com>
*/
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/inet_diag.h>
#include <net/netlink.h>
#include "protocol.h"
/* Emit one diag record for @sk into @skb as part of a multi-part dump.
 *
 * The socket is first run through the inet_diag_bc_sk() check against
 * cb->data; sockets failing that check are silently skipped.
 *
 * Returns 0 when the socket was skipped, otherwise the return value of
 * inet_sk_diag_fill().
 */
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
			struct netlink_callback *cb,
			const struct inet_diag_req_v2 *req,
			bool net_admin)
{
	int ret = 0;

	if (inet_diag_bc_sk(cb->data, sk))
		ret = inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req,
					NLM_F_MULTI, net_admin);

	return ret;
}
static int mptcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
struct sk_buff *in_skb = cb->skb;
struct mptcp_sock *msk = NULL;
struct sk_buff *rep;
int err = -ENOENT;
struct net *net;
struct sock *sk;
net = sock_net(in_skb->sk);
msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]);
if (!msk)
goto out_nosk;
err = -ENOMEM;
sk = (struct sock *)msk;
rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
inet_diag_msg_attrs_size() +
nla_total_size(sizeof(struct mptcp_info)) +
nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
GFP_KERNEL);
if (!rep)
goto out;
err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
goto out;
}
err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
out:
sock_put(sk);
out_nosk:
return err;
}
/* Dump cursor kept in the netlink callback scratch area (cb->ctx), so that
 * a dump which stops early can retry from the same position.
 */
struct mptcp_diag_ctx {
/* s_slot/s_num: cursor handed to mptcp_token_iter_next() */
long s_slot;
long s_num;
/* l_slot: listener hash bucket index the listener walk starts from */
unsigned int l_slot;
/* l_num: number of leading entries to skip inside bucket l_slot */
unsigned int l_num;
};
/* Dump MPTCP listener sockets found through the TCP listener hash.
 *
 * Walks the lhash2 buckets of the requester's netns, starting at the bucket
 * and in-bucket position saved in the dump context (cb->ctx). Each bucket is
 * traversed under rcu_read_lock() plus the bucket spinlock. For every entry
 * that carries a subflow context with the "mptcp" ULP, the socket referenced
 * by ctx->conn is filtered against the request (netns, family, source port)
 * and dumped via sk_diag_dump().
 *
 * When sk_diag_dump() fails, the current bucket and position are saved in
 * the context and the function returns, so the walk can be retried from the
 * same entry. When the walk completes, l_slot is left past the last bucket.
 *
 * @skb:       dump skb being filled
 * @cb:        netlink dump callback; cb->ctx holds a struct mptcp_diag_ctx
 * @r:         diag request carrying the family/port selectors
 * @net_admin: forwarded unchanged to sk_diag_dump()
 */
static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb,
				      const struct inet_diag_req_v2 *r,
				      bool net_admin)
{
	struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct inet_hashinfo *hinfo;
	int i;

	hinfo = net->ipv4.tcp_death_row.hashinfo;

	for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) {
		struct inet_listen_hashbucket *ilb;
		struct hlist_nulls_node *node;
		struct sock *sk;
		/* position of the current entry inside this bucket */
		int num = 0;

		ilb = &hinfo->lhash2[i];

		rcu_read_lock();
		spin_lock(&ilb->lock);
		sk_nulls_for_each(sk, node, &ilb->nulls_head) {
			const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
			/* taken from the hashed socket, before sk is replaced below */
			struct inet_sock *inet = inet_sk(sk);
			int ret;

			/* skip entries already handled by a previous pass */
			if (num < diag_ctx->l_num)
				goto next_listen;

			if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp"))
				goto next_listen;

			/* Switch to the socket referenced by the subflow
			 * context; the list iteration itself advances through
			 * 'node', so reusing 'sk' here is safe.
			 */
			sk = ctx->conn;
			if (!sk || !net_eq(sock_net(sk), net))
				goto next_listen;

			if (r->sdiag_family != AF_UNSPEC &&
			    sk->sk_family != r->sdiag_family)
				goto next_listen;

			/* a zero source port in the request matches any port */
			if (r->id.idiag_sport != inet->inet_sport &&
			    r->id.idiag_sport)
				goto next_listen;

			if (!refcount_inc_not_zero(&sk->sk_refcnt))
				goto next_listen;

			ret = sk_diag_dump(sk, skb, cb, r, net_admin);

			sock_put(sk);

			if (ret < 0) {
				spin_unlock(&ilb->lock);
				rcu_read_unlock();
				/* retry from this very entry next time */
				diag_ctx->l_slot = i;
				diag_ctx->l_num = num;
				return;
			}

			/* Entry dumped: a later retry must start after it.
			 * 'num' is deliberately NOT reset here: it has to keep
			 * tracking the position inside the bucket, otherwise
			 * the skip check above would wrongly drop following
			 * entries and a saved retry position would be off.
			 */
			diag_ctx->l_num = num + 1;
next_listen:
			++num;
		}
		spin_unlock(&ilb->lock);
		rcu_read_unlock();

		cond_resched();
		/* the next bucket is always walked from its first entry */
		diag_ctx->l_num = 0;
	}

	diag_ctx->l_num = 0;
	diag_ctx->l_slot = i;
}
static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
struct mptcp_sock *msk;
BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
&diag_ctx->s_num)) != NULL) {
struct inet_sock *inet = (struct inet_sock *)msk;
struct sock *sk = (struct sock *)msk;
int ret = 0;
if (!(r->idiag_states & (1 << sk->sk_state)))
goto next;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
goto next;
if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next;
if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next;
ret = sk_diag_dump(sk, skb, cb, r, net_admin);
next:
sock_put(sk);
if (ret < 0) {
/* will retry on the same position */
diag_ctx->s_num--;
break;
}
cond_resched();
}
if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0)
mptcp_diag_dump_listeners(skb, cb, r, net_admin);
}
/* Fill the queue counters of @r and, when requested, the MPTCP info blob.
 *
 * idiag_rqueue is the socket receive memory plus the MPTCP-level backlog
 * length; idiag_wqueue is the socket write memory. For a socket in
 * TCP_LISTEN state with a first subflow, both are replaced by that
 * subflow's accept backlog and its maximum.
 *
 * @_info may be NULL, in which case only @r is updated; otherwise it is
 * handed to mptcp_diag_fill_info() as a struct mptcp_info.
 */
static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
				void *_info)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	r->idiag_rqueue = sk_rmem_alloc_get(sk) + READ_ONCE(msk->backlog_len);
	r->idiag_wqueue = sk_wmem_alloc_get(sk);

	if (inet_sk_state_load(sk) == TCP_LISTEN) {
		struct sock *first = READ_ONCE(msk->first);

		/* Report the TCP listener's values instead, so that Send-Q
		 * shows the accept queue.
		 */
		if (first) {
			r->idiag_rqueue = READ_ONCE(first->sk_ack_backlog);
			r->idiag_wqueue = READ_ONCE(first->sk_max_ack_backlog);
		}
	}

	if (_info)
		mptcp_diag_fill_info(msk, _info);
}
/* inet_diag handler for IPPROTO_MPTCP: wires this file's dump, dump_one and
 * get_info callbacks and declares struct mptcp_info as the info payload.
 */
static const struct inet_diag_handler mptcp_diag_handler = {
.owner = THIS_MODULE,
.dump = mptcp_diag_dump,
.dump_one = mptcp_diag_dump_one,
.idiag_get_info = mptcp_diag_get_info,
.idiag_type = IPPROTO_MPTCP,
.idiag_info_size = sizeof(struct mptcp_info),
};
/* Module init: register the MPTCP diag handler; returns the result of
 * inet_diag_register().
 */
static int __init mptcp_diag_init(void)
{
return inet_diag_register(&mptcp_diag_handler);
}
/* Module exit: unregister the MPTCP diag handler. */
static void __exit mptcp_diag_exit(void)
{
inet_diag_unregister(&mptcp_diag_handler);
}
/* Module entry/exit hooks and metadata. */
module_init(mptcp_diag_init);
module_exit(mptcp_diag_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG");
/* Alias of the form <family>-<protocol> for NETLINK_SOCK_DIAG requests. */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);