mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 11:21:26 -04:00
Merge branch 'bpf-fix-fionread-and-copied_seq-issues'
Jiayuan Chen says:
====================
bpf: Fix FIONREAD and copied_seq issues
syzkaller reported a bug [1] where a socket using sockmap, after being
unloaded, exposed incorrect copied_seq calculation. The selftest I
provided can be used to reproduce the issue reported by syzkaller.
TCP recvmsg seq # bug 2: copied E92C873, seq E68D125, rcvnxt E7CEB7C, fl 40
WARNING: CPU: 1 PID: 5997 at net/ipv4/tcp.c:2724 tcp_recvmsg_locked+0xb2f/0x2910 net/ipv4/tcp.c:2724
Call Trace:
<TASK>
receive_fallback_to_copy net/ipv4/tcp.c:1968 [inline]
tcp_zerocopy_receive+0x131a/0x2120 net/ipv4/tcp.c:2200
do_tcp_getsockopt+0xe28/0x26c0 net/ipv4/tcp.c:4713
tcp_getsockopt+0xdf/0x100 net/ipv4/tcp.c:4812
do_sock_getsockopt+0x34d/0x440 net/socket.c:2421
__sys_getsockopt+0x12f/0x260 net/socket.c:2450
__do_sys_getsockopt net/socket.c:2457 [inline]
__se_sys_getsockopt net/socket.c:2454 [inline]
__x64_sys_getsockopt+0xbd/0x160 net/socket.c:2454
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xcd/0xfa0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
A sockmap socket maintains its own receive queue (ingress_msg) which may
contain data from either its own protocol stack or forwarded from other
sockets.
FD1:read()
-- FD1->copied_seq++
| [read data]
|
[enqueue data] v
[sockmap] -> ingress to self -> ingress_msg queue
FD1 native stack ------> ^
-- FD1->rcv_nxt++ -> redirect to other | [enqueue data]
| |
| ingress to FD1
v ^
... | [sockmap]
FD2 native stack
The issue occurs when reading from ingress_msg: we update tp->copied_seq
by default, but if the data comes from other sockets (not the socket's
own protocol stack), tcp->rcv_nxt remains unchanged. Later, when
converting back to a native socket, reads may fail as copied_seq could
be significantly larger than rcv_nxt.
Additionally, FIONREAD calculation based on copied_seq and rcv_nxt is
insufficient for sockmap sockets, requiring separate field tracking.
[1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983
---
v7 -> v9: Address Jakub Sitnicki's feedback:
- Remove sk_receive_queue check in tcp_bpf_ioctl, only report
ingress_msg data length for FIONREAD
- Minor nits fixes
- Add Reviewed-by tag from John Fastabend
- Fix ci error
https://lore.kernel.org/bpf/20260113025121.197535-1-jiayuan.chen@linux.dev/
v5 -> v7: Some modifications suggested by Jakub Sitnicki, and added Reviewed-by tag.
https://lore.kernel.org/bpf/20260106051458.279151-1-jiayuan.chen@linux.dev/
v1 -> v5: Use skmsg.sk instead of extending the BPF_F_XXX macro, and fix the
failure reported by CI
v1: https://lore.kernel.org/bpf/20251117110736.293040-1-jiayuan.chen@linux.dev/
====================
Link: https://patch.msgid.link/20260124113314.113584-1-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
@@ -97,6 +97,8 @@ struct sk_psock {
|
||||
struct sk_buff_head ingress_skb;
|
||||
struct list_head ingress_msg;
|
||||
spinlock_t ingress_lock;
|
||||
/** @msg_tot_len: Total bytes queued in ingress_msg list. */
|
||||
u32 msg_tot_len;
|
||||
unsigned long state;
|
||||
struct list_head link;
|
||||
spinlock_t link_lock;
|
||||
@@ -141,6 +143,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
||||
struct sk_msg *msg, u32 bytes);
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags);
|
||||
int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags, int *copied_from_self);
|
||||
bool sk_msg_is_readable(struct sock *sk);
|
||||
|
||||
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
|
||||
@@ -319,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
|
||||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock)
|
||||
{
|
||||
/* Used by ioctl to read msg_tot_len only; lock-free for performance */
|
||||
return READ_ONCE(psock->msg_tot_len);
|
||||
}
|
||||
|
||||
static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff)
|
||||
{
|
||||
/* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock().
|
||||
* ingress_lock should be held to prevent concurrent updates to msg_tot_len
|
||||
*/
|
||||
WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff);
|
||||
}
|
||||
|
||||
static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff)
|
||||
{
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
sk_psock_msg_len_add_locked(psock, diff);
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
}
|
||||
|
||||
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
|
||||
struct sk_msg *msg)
|
||||
{
|
||||
@@ -327,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
|
||||
list_add_tail(&msg->list, &psock->ingress_msg);
|
||||
sk_psock_msg_len_add_locked(psock, msg->sg.size);
|
||||
ret = true;
|
||||
} else {
|
||||
sk_msg_free(psock->sk, msg);
|
||||
@@ -343,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
|
||||
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
if (msg)
|
||||
if (msg) {
|
||||
list_del(&msg->list);
|
||||
sk_psock_msg_len_add_locked(psock, -msg->sg.size);
|
||||
}
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
return msg;
|
||||
}
|
||||
|
||||
static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock)
|
||||
{
|
||||
return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
}
|
||||
|
||||
static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
|
||||
{
|
||||
struct sk_msg *msg;
|
||||
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
|
||||
msg = sk_psock_peek_msg_locked(psock);
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
return msg;
|
||||
}
|
||||
@@ -521,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
|
||||
return !!psock->saved_data_ready;
|
||||
}
|
||||
|
||||
/* for tcp only, sk is locked */
|
||||
static inline ssize_t sk_psock_msg_inq(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
ssize_t inq = 0;
|
||||
|
||||
psock = sk_psock_get(sk);
|
||||
if (likely(psock)) {
|
||||
inq = sk_psock_get_msg_len_nolock(psock);
|
||||
sk_psock_put(sk, psock);
|
||||
}
|
||||
return inq;
|
||||
}
|
||||
|
||||
/* for udp only, sk is not locked */
|
||||
static inline ssize_t sk_msg_first_len(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
struct sk_msg *msg;
|
||||
ssize_t inq = 0;
|
||||
|
||||
psock = sk_psock_get(sk);
|
||||
if (likely(psock)) {
|
||||
spin_lock_bh(&psock->ingress_lock);
|
||||
msg = sk_psock_peek_msg_locked(psock);
|
||||
if (msg)
|
||||
inq = msg->sg.size;
|
||||
spin_unlock_bh(&psock->ingress_lock);
|
||||
sk_psock_put(sk, psock);
|
||||
}
|
||||
return inq;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
|
||||
|
||||
#define BPF_F_STRPARSER (1UL << 1)
|
||||
|
||||
@@ -409,22 +409,26 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
|
||||
|
||||
/* Receive sk_msg from psock->ingress_msg to @msg. */
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags)
|
||||
int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags, int *copied_from_self)
|
||||
{
|
||||
struct iov_iter *iter = &msg->msg_iter;
|
||||
int peek = flags & MSG_PEEK;
|
||||
struct sk_msg *msg_rx;
|
||||
int i, copied = 0;
|
||||
bool from_self;
|
||||
|
||||
msg_rx = sk_psock_peek_msg(psock);
|
||||
if (copied_from_self)
|
||||
*copied_from_self = 0;
|
||||
|
||||
while (copied != len) {
|
||||
struct scatterlist *sge;
|
||||
|
||||
if (unlikely(!msg_rx))
|
||||
break;
|
||||
|
||||
from_self = msg_rx->sk == sk;
|
||||
i = msg_rx->sg.start;
|
||||
do {
|
||||
struct page *page;
|
||||
@@ -443,6 +447,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
}
|
||||
|
||||
copied += copy;
|
||||
if (from_self && copied_from_self)
|
||||
*copied_from_self += copy;
|
||||
|
||||
if (likely(!peek)) {
|
||||
sge->offset += copy;
|
||||
sge->length -= copy;
|
||||
@@ -451,6 +458,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
atomic_sub(copy, &sk->sk_rmem_alloc);
|
||||
}
|
||||
msg_rx->sg.size -= copy;
|
||||
sk_psock_msg_len_add(psock, -copy);
|
||||
|
||||
if (!sge->length) {
|
||||
sk_msg_iter_var_next(i);
|
||||
@@ -487,6 +495,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
out:
|
||||
return copied;
|
||||
}
|
||||
|
||||
/* Receive sk_msg from psock->ingress_msg to @msg. */
|
||||
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
|
||||
int len, int flags)
|
||||
{
|
||||
return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
|
||||
|
||||
bool sk_msg_is_readable(struct sock *sk)
|
||||
@@ -616,6 +631,12 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
|
||||
if (unlikely(!msg))
|
||||
return -EAGAIN;
|
||||
skb_set_owner_r(skb, sk);
|
||||
|
||||
/* This is used in tcp_bpf_recvmsg_parser() to determine whether the
|
||||
* data originates from the socket's own protocol stack. No need to
|
||||
* refcount sk because msg's lifetime is bound to sk via the ingress_msg.
|
||||
*/
|
||||
msg->sk = sk;
|
||||
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
|
||||
if (err < 0)
|
||||
kfree(msg);
|
||||
@@ -801,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
|
||||
list_del(&msg->list);
|
||||
if (!msg->skb)
|
||||
atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
|
||||
sk_psock_msg_len_add(psock, -msg->sg.size);
|
||||
sk_msg_free(psock->sk, msg);
|
||||
kfree(msg);
|
||||
}
|
||||
WARN_ON_ONCE(psock->msg_tot_len);
|
||||
}
|
||||
|
||||
static void __sk_psock_zap_ingress(struct sk_psock *psock)
|
||||
@@ -909,6 +932,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
||||
sk_msg_compute_data_pointers(msg);
|
||||
msg->sk = sk;
|
||||
ret = bpf_prog_run_pin_on_cpu(prog, msg);
|
||||
msg->sk = NULL;
|
||||
ret = sk_psock_map_verd(ret, msg->sk_redir);
|
||||
psock->apply_bytes = msg->apply_bytes;
|
||||
if (ret == __SK_REDIRECT) {
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include <net/inet_common.h>
|
||||
#include <net/tls.h>
|
||||
#include <asm/ioctls.h>
|
||||
|
||||
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
@@ -226,6 +227,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
||||
int peek = flags & MSG_PEEK;
|
||||
struct sk_psock *psock;
|
||||
struct tcp_sock *tcp;
|
||||
int copied_from_self = 0;
|
||||
int copied = 0;
|
||||
u32 seq;
|
||||
|
||||
@@ -262,7 +264,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
||||
}
|
||||
|
||||
msg_bytes_ready:
|
||||
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
|
||||
copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self);
|
||||
/* The typical case for EFAULT is the socket was gracefully
|
||||
* shutdown with a FIN pkt. So check here the other case is
|
||||
* some error on copy_page_to_iter which would be unexpected.
|
||||
@@ -277,7 +279,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
seq += copied;
|
||||
seq += copied_from_self;
|
||||
if (!copied) {
|
||||
long timeo;
|
||||
int data;
|
||||
@@ -331,6 +333,24 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
|
||||
return copied;
|
||||
}
|
||||
|
||||
static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
|
||||
{
|
||||
bool slow;
|
||||
|
||||
if (cmd != SIOCINQ)
|
||||
return tcp_ioctl(sk, cmd, karg);
|
||||
|
||||
/* works similarly to tcp_ioctl */
|
||||
if (sk->sk_state == TCP_LISTEN)
|
||||
return -EINVAL;
|
||||
|
||||
slow = lock_sock_fast(sk);
|
||||
*karg = sk_psock_msg_inq(sk);
|
||||
unlock_sock_fast(sk, slow);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
||||
int flags, int *addr_len)
|
||||
{
|
||||
@@ -609,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
|
||||
prot[TCP_BPF_BASE].close = sock_map_close;
|
||||
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
|
||||
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
|
||||
prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl;
|
||||
|
||||
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
|
||||
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <net/sock.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/inet_common.h>
|
||||
#include <asm/ioctls.h>
|
||||
|
||||
#include "udp_impl.h"
|
||||
|
||||
@@ -111,12 +112,26 @@ enum {
|
||||
static DEFINE_SPINLOCK(udpv6_prot_lock);
|
||||
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
|
||||
|
||||
static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
|
||||
{
|
||||
if (cmd != SIOCINQ)
|
||||
return udp_ioctl(sk, cmd, karg);
|
||||
|
||||
/* Since we don't hold a lock, sk_receive_queue may contain data.
|
||||
* BPF might only be processing this data at the moment. We only
|
||||
* care about the data in the ingress_msg here.
|
||||
*/
|
||||
*karg = sk_msg_first_len(sk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
|
||||
{
|
||||
*prot = *base;
|
||||
prot->close = sock_map_close;
|
||||
prot->recvmsg = udp_bpf_recvmsg;
|
||||
prot->sock_is_readable = sk_msg_is_readable;
|
||||
*prot = *base;
|
||||
prot->close = sock_map_close;
|
||||
prot->recvmsg = udp_bpf_recvmsg;
|
||||
prot->sock_is_readable = sk_msg_is_readable;
|
||||
prot->ioctl = udp_bpf_ioctl;
|
||||
}
|
||||
|
||||
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// Copyright (c) 2020 Cloudflare
|
||||
#include <error.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/socket.h>
|
||||
#include <sys/epoll.h>
|
||||
|
||||
#include "test_progs.h"
|
||||
@@ -22,6 +23,15 @@
|
||||
#define TCP_REPAIR_ON 1
|
||||
#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
|
||||
|
||||
/**
|
||||
* SOL_TCP is defined in <netinet/tcp.h> (glibc), but the copybuf_address
|
||||
* field of tcp_zerocopy_receive is not yet included in older versions.
|
||||
* This workaround remains necessary until the glibc update propagates.
|
||||
*/
|
||||
#ifndef SOL_TCP
|
||||
#define SOL_TCP 6
|
||||
#endif
|
||||
|
||||
static int connected_socket_v4(void)
|
||||
{
|
||||
struct sockaddr_in addr = {
|
||||
@@ -536,13 +546,14 @@ static void test_sockmap_skb_verdict_shutdown(void)
|
||||
}
|
||||
|
||||
|
||||
static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog)
|
||||
{
|
||||
int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1;
|
||||
int expected, zero = 0, sent, recvd, avail;
|
||||
struct test_sockmap_pass_prog *pass = NULL;
|
||||
struct test_sockmap_drop_prog *drop = NULL;
|
||||
char buf[256] = "0123456789";
|
||||
int split_len = sizeof(buf) / 2;
|
||||
|
||||
if (pass_prog) {
|
||||
pass = test_sockmap_pass_prog__open_and_load();
|
||||
@@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
return;
|
||||
verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
|
||||
map = bpf_map__fd(pass->maps.sock_map_rx);
|
||||
expected = sizeof(buf);
|
||||
if (sotype == SOCK_DGRAM)
|
||||
expected = split_len; /* FIONREAD for UDP is different from TCP */
|
||||
else
|
||||
expected = sizeof(buf);
|
||||
} else {
|
||||
drop = test_sockmap_drop_prog__open_and_load();
|
||||
if (!ASSERT_OK_PTR(drop, "open_and_load"))
|
||||
@@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
if (!ASSERT_OK(err, "bpf_prog_attach"))
|
||||
goto out;
|
||||
|
||||
err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1);
|
||||
err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1);
|
||||
if (!ASSERT_OK(err, "create_socket_pairs()"))
|
||||
goto out;
|
||||
|
||||
@@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem(c1)"))
|
||||
goto out_close;
|
||||
|
||||
sent = xsend(p1, &buf, sizeof(buf), 0);
|
||||
ASSERT_EQ(sent, sizeof(buf), "xsend(p0)");
|
||||
sent = xsend(p1, &buf, split_len, 0);
|
||||
sent += xsend(p1, &buf, sizeof(buf) - split_len, 0);
|
||||
ASSERT_EQ(sent, sizeof(buf), "xsend(p1)");
|
||||
err = ioctl(c1, FIONREAD, &avail);
|
||||
ASSERT_OK(err, "ioctl(FIONREAD) error");
|
||||
ASSERT_EQ(avail, expected, "ioctl(FIONREAD)");
|
||||
@@ -597,6 +612,12 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
test_sockmap_drop_prog__destroy(drop);
|
||||
}
|
||||
|
||||
static void test_sockmap_skb_verdict_fionread(bool pass_prog)
|
||||
{
|
||||
do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog);
|
||||
do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog);
|
||||
}
|
||||
|
||||
static void test_sockmap_skb_verdict_change_tail(void)
|
||||
{
|
||||
struct test_sockmap_change_tail *skel;
|
||||
@@ -1042,6 +1063,257 @@ static void test_sockmap_vsock_unconnected(void)
|
||||
xclose(map);
|
||||
}
|
||||
|
||||
/* it is used to reproduce WARNING */
|
||||
static void test_sockmap_zc(void)
|
||||
{
|
||||
int map, err, sent, recvd, zero = 0, one = 1, on = 1;
|
||||
char buf[10] = "0123456789", rcv[11], addr[100];
|
||||
struct test_sockmap_pass_prog *skel = NULL;
|
||||
int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
|
||||
struct tcp_zerocopy_receive zc;
|
||||
socklen_t zc_len = sizeof(zc);
|
||||
struct bpf_program *prog;
|
||||
|
||||
skel = test_sockmap_pass_prog__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_and_load"))
|
||||
return;
|
||||
|
||||
if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1))
|
||||
goto end;
|
||||
|
||||
prog = skel->progs.prog_skb_verdict_ingress;
|
||||
map = bpf_map__fd(skel->maps.sock_map_rx);
|
||||
|
||||
err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
|
||||
if (!ASSERT_OK(err, "bpf_prog_attach"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem"))
|
||||
goto end;
|
||||
|
||||
sent = xsend(c0, buf, sizeof(buf), 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf), "xsend"))
|
||||
goto end;
|
||||
|
||||
/* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */
|
||||
recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1);
|
||||
if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)"))
|
||||
goto end;
|
||||
|
||||
/* uninstall sockmap of p1 */
|
||||
bpf_map_delete_elem(map, &one);
|
||||
|
||||
/* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */
|
||||
sent = xsend(c1, buf, sizeof(buf) - 1, 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend"))
|
||||
goto end;
|
||||
|
||||
err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on));
|
||||
if (!ASSERT_OK(err, "setsockopt"))
|
||||
goto end;
|
||||
|
||||
memset(&zc, 0, sizeof(zc));
|
||||
zc.copybuf_address = (__u64)((unsigned long)addr);
|
||||
zc.copybuf_len = sizeof(addr);
|
||||
|
||||
err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
|
||||
if (!ASSERT_OK(err, "getsockopt"))
|
||||
goto end;
|
||||
|
||||
end:
|
||||
if (c0 >= 0)
|
||||
close(c0);
|
||||
if (p0 >= 0)
|
||||
close(p0);
|
||||
if (c1 >= 0)
|
||||
close(c1);
|
||||
if (p1 >= 0)
|
||||
close(p1);
|
||||
test_sockmap_pass_prog__destroy(skel);
|
||||
}
|
||||
|
||||
/* it is used to check whether copied_seq of sk is correct */
|
||||
static void test_sockmap_copied_seq(bool strp)
|
||||
{
|
||||
int i, map, err, sent, recvd, zero = 0, one = 1;
|
||||
struct test_sockmap_pass_prog *skel = NULL;
|
||||
int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
|
||||
char buf[10] = "0123456789", rcv[11];
|
||||
struct bpf_program *prog;
|
||||
|
||||
skel = test_sockmap_pass_prog__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_and_load"))
|
||||
return;
|
||||
|
||||
if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1))
|
||||
goto end;
|
||||
|
||||
prog = skel->progs.prog_skb_verdict_ingress;
|
||||
map = bpf_map__fd(skel->maps.sock_map_rx);
|
||||
|
||||
err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
|
||||
if (!ASSERT_OK(err, "bpf_prog_attach verdict"))
|
||||
goto end;
|
||||
|
||||
if (strp) {
|
||||
prog = skel->progs.prog_skb_verdict_ingress_strp;
|
||||
err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0);
|
||||
if (!ASSERT_OK(err, "bpf_prog_attach parser"))
|
||||
goto end;
|
||||
}
|
||||
|
||||
err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem(p0)"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem(p1)"))
|
||||
goto end;
|
||||
|
||||
/* just trigger sockmap: data sent by c0 will be received by p1 */
|
||||
sent = xsend(c0, buf, sizeof(buf), 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf"))
|
||||
goto end;
|
||||
|
||||
/* do partial read */
|
||||
recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1);
|
||||
recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1);
|
||||
if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") ||
|
||||
!ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch"))
|
||||
goto end;
|
||||
|
||||
/* uninstall sockmap of p1 and p0 */
|
||||
err = bpf_map_delete_elem(map, &one);
|
||||
if (!ASSERT_OK(err, "bpf_map_delete_elem(1)"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_delete_elem(map, &zero);
|
||||
if (!ASSERT_OK(err, "bpf_map_delete_elem(0)"))
|
||||
goto end;
|
||||
|
||||
/* now all sockets become plain socket, they should still work */
|
||||
for (i = 0; i < 5; i++) {
|
||||
/* test copied_seq of p1 by running tcp native stack */
|
||||
sent = xsend(c1, buf, sizeof(buf), 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native"))
|
||||
goto end;
|
||||
|
||||
recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT);
|
||||
if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native"))
|
||||
goto end;
|
||||
|
||||
/* p0 previously redirected skb to p1, we also check copied_seq of p0 */
|
||||
sent = xsend(c0, buf, sizeof(buf), 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native"))
|
||||
goto end;
|
||||
|
||||
recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT);
|
||||
if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native"))
|
||||
goto end;
|
||||
}
|
||||
|
||||
end:
|
||||
if (c0 >= 0)
|
||||
close(c0);
|
||||
if (p0 >= 0)
|
||||
close(p0);
|
||||
if (c1 >= 0)
|
||||
close(c1);
|
||||
if (p1 >= 0)
|
||||
close(p1);
|
||||
test_sockmap_pass_prog__destroy(skel);
|
||||
}
|
||||
|
||||
/* Wait until FIONREAD returns the expected value or timeout */
|
||||
static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms)
|
||||
{
|
||||
unsigned int elapsed = 0;
|
||||
int avail = 0;
|
||||
|
||||
while (elapsed < timeout_ms) {
|
||||
if (ioctl(fd, FIONREAD, &avail) < 0)
|
||||
return -errno;
|
||||
if (avail >= expected)
|
||||
return avail;
|
||||
usleep(1000);
|
||||
elapsed++;
|
||||
}
|
||||
return avail;
|
||||
}
|
||||
|
||||
/* it is used to send data via the native stack and BPF redirecting */
|
||||
static void test_sockmap_multi_channels(int sotype)
|
||||
{
|
||||
int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected;
|
||||
struct test_sockmap_pass_prog *skel = NULL;
|
||||
int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
|
||||
char buf[10] = "0123456789", rcv[11];
|
||||
struct bpf_program *prog;
|
||||
|
||||
skel = test_sockmap_pass_prog__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_and_load"))
|
||||
return;
|
||||
|
||||
err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1);
|
||||
if (err)
|
||||
goto end;
|
||||
|
||||
prog = skel->progs.prog_skb_verdict_ingress;
|
||||
map = bpf_map__fd(skel->maps.sock_map_rx);
|
||||
|
||||
err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
|
||||
if (!ASSERT_OK(err, "bpf_prog_attach verdict"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem(p0)"))
|
||||
goto end;
|
||||
|
||||
err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
|
||||
if (!ASSERT_OK(err, "bpf_map_update_elem"))
|
||||
goto end;
|
||||
|
||||
/* send data to p1 via native stack */
|
||||
sent = xsend(c1, buf, 2, 0);
|
||||
if (!ASSERT_EQ(sent, 2, "xsend(2)"))
|
||||
goto end;
|
||||
|
||||
avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC);
|
||||
ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return");
|
||||
|
||||
/* send data to p1 via bpf redirecting */
|
||||
sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0);
|
||||
if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)"))
|
||||
goto end;
|
||||
|
||||
/* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable
|
||||
* here since it may return immediately if prior data is already queued.
|
||||
*/
|
||||
expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf);
|
||||
avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC);
|
||||
ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return");
|
||||
|
||||
recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1);
|
||||
if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") ||
|
||||
!ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch"))
|
||||
goto end;
|
||||
end:
|
||||
if (c0 >= 0)
|
||||
close(c0);
|
||||
if (p0 >= 0)
|
||||
close(p0);
|
||||
if (c1 >= 0)
|
||||
close(c1);
|
||||
if (p1 >= 0)
|
||||
close(p1);
|
||||
test_sockmap_pass_prog__destroy(skel);
|
||||
}
|
||||
|
||||
void test_sockmap_basic(void)
|
||||
{
|
||||
if (test__start_subtest("sockmap create_update_free"))
|
||||
@@ -1108,4 +1380,14 @@ void test_sockmap_basic(void)
|
||||
test_sockmap_skb_verdict_vsock_poll();
|
||||
if (test__start_subtest("sockmap vsock unconnected"))
|
||||
test_sockmap_vsock_unconnected();
|
||||
if (test__start_subtest("sockmap with zc"))
|
||||
test_sockmap_zc();
|
||||
if (test__start_subtest("sockmap recover"))
|
||||
test_sockmap_copied_seq(false);
|
||||
if (test__start_subtest("sockmap recover with strp"))
|
||||
test_sockmap_copied_seq(true);
|
||||
if (test__start_subtest("sockmap tcp multi channels"))
|
||||
test_sockmap_multi_channels(SOCK_STREAM);
|
||||
if (test__start_subtest("sockmap udp multi channels"))
|
||||
test_sockmap_multi_channels(SOCK_DGRAM);
|
||||
}
|
||||
|
||||
@@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb)
|
||||
return SK_PASS;
|
||||
}
|
||||
|
||||
SEC("sk_skb/stream_verdict")
|
||||
int prog_skb_verdict_ingress(struct __sk_buff *skb)
|
||||
{
|
||||
int one = 1;
|
||||
|
||||
return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS);
|
||||
}
|
||||
|
||||
SEC("sk_skb/stream_parser")
|
||||
int prog_skb_verdict_ingress_strp(struct __sk_buff *skb)
|
||||
{
|
||||
return skb->len;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
Reference in New Issue
Block a user