From 41c95dd6a604dc3f5fae55c99c138cc8e7fec76e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:23 +0200 Subject: [PATCH 1/5] bpf: Allow a TCP CC to write sk_pacing_rate and sk_pacing_status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A CC that implements tcp_congestion_ops.cong_control() should be able to control sk_pacing_rate and set sk_pacing_status, since tcp_update_pacing_rate() is never called in this case. A built-in CC or one from a kernel module is already able to write to both struct sock members. For a BPF program, write access has not been allowed, yet. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-2-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- net/ipv4/bpf_tcp_ca.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index f79ab942f03b..1f5c53ede4e5 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -111,6 +111,12 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, } switch (off) { + case offsetof(struct sock, sk_pacing_rate): + end = offsetofend(struct sock, sk_pacing_rate); + break; + case offsetof(struct sock, sk_pacing_status): + end = offsetofend(struct sock, sk_pacing_status); + break; case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): end = offsetofend(struct inet_connection_sock, icsk_ca_priv); break; From 9f0265e921dee14096943ee11f793fa076aa7a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:24 +0200 Subject: [PATCH 2/5] bpf: Require only one of cong_avoid() and cong_control() from a TCP CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the check for required and optional functions in a struct tcp_congestion_ops from bpf_tcp_ca.c. Rely on tcp_register_congestion_control() to reject a BPF CC that does not implement all required functions, as it will do for a non-BPF CC. When a CC implements tcp_congestion_ops.cong_control(), the alternate cong_avoid() is not in use in the TCP stack. Previously, a BPF CC was still forced to implement cong_avoid() as a no-op since it was non-optional in bpf_tcp_ca.c. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-3-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_struct_ops.c | 7 +++---- net/ipv4/bpf_tcp_ca.c | 33 --------------------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d9a3c9207240..7e0068c3399c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -503,10 +503,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 1f5c53ede4e5..7a181631b995 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,18 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ extern struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 optional_ops[] = { - offsetof(struct tcp_congestion_ops, init), - offsetof(struct tcp_congestion_ops, release), - offsetof(struct tcp_congestion_ops, set_state), - offsetof(struct tcp_congestion_ops, cwnd_event), - offsetof(struct tcp_congestion_ops, in_ack_event), - offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), - offsetof(struct tcp_congestion_ops, sndbuf_expand), - offsetof(struct tcp_congestion_ops, cong_control), -}; - static u32 unsupported_ops[] = { offsetof(struct tcp_congestion_ops, get_info), }; @@ -51,18 +39,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_optional(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { - if (member_offset == optional_ops[i]) - return true; - } - - return false; -} - static bool is_unsupported(u32 member_offset) { unsigned int i; @@ -246,7 +222,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, { const struct tcp_congestion_ops *utcp_ca; struct tcp_congestion_ops *tcp_ca; - int prog_fd; u32 moff; utcp_ca = (const struct tcp_congestion_ops *)udata; @@ -268,14 +243,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 1; } - if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) - return 0; - - /* Ensure bpf_prog is provided for compulsory func ptr */ - prog_fd = (int)(*(unsigned long *)(udata + moff)); - if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) - return -EINVAL; - return 0; } From 6e945d57cc9f6e27893d57a419434a2859ba6f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:25 +0200 Subject: [PATCH 3/5] selftests/bpf: Test a BPF CC writing sk_pacing_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF is allowed to write sk_pacing_rate and sk_pacing_status in struct sock. This is needed when cong_control() is implemented and used. Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-4-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 19 ++++++ .../bpf/progs/tcp_ca_write_sk_pacing.c | 60 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e9a9a31b2ffe..e79f3f5a9d33 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -9,6 +9,7 @@ #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" +#include "tcp_ca_write_sk_pacing.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -322,6 +323,22 @@ static void test_rel_setsockopt(void) bpf_dctcp_release__destroy(rel_skel); } +static void test_write_sk_pacing(void) +{ + struct tcp_ca_write_sk_pacing *skel; + struct bpf_link *link; + + skel = tcp_ca_write_sk_pacing__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + link = bpf_map__attach_struct_ops(skel->maps.write_sk_pacing); + ASSERT_OK_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_write_sk_pacing__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -334,4 +351,6 @@ void test_bpf_tcp_ca(void) test_dctcp_fallback(); if (test__start_subtest("rel_setsockopt")) test_rel_setsockopt(); + if (test__start_subtest("write_sk_pacing")) + test_write_sk_pacing(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c new file mode 100644 index 000000000000..43447704cf0e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define USEC_PER_SEC 1000000UL + +#define min(a, b) ((a) < (b) ? (a) : (b)) + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/write_sk_pacing_init") +void BPF_PROG(write_sk_pacing_init, struct sock *sk) +{ +#ifdef ENABLE_ATOMICS_TESTS + __sync_bool_compare_and_swap(&sk->sk_pacing_status, SK_PACING_NONE, + SK_PACING_NEEDED); +#else + sk->sk_pacing_status = SK_PACING_NEEDED; +#endif +} + +SEC("struct_ops/write_sk_pacing_cong_control") +void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, + const struct rate_sample *rs) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned long rate = + ((tp->snd_cwnd * tp->mss_cache * USEC_PER_SEC) << 3) / + (tp->srtt_us ?: 1U << 3); + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +SEC("struct_ops/write_sk_pacing_ssthresh") +__u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/write_sk_pacing_undo_cwnd") +__u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops write_sk_pacing = { + .init = (void *)write_sk_pacing_init, + .cong_control = (void *)write_sk_pacing_cong_control, + .ssthresh = (void *)write_sk_pacing_ssthresh, + .undo_cwnd = (void *)write_sk_pacing_undo_cwnd, + .name = "bpf_w_sk_pacing", +}; From 0735627d78caa56f219dc14608ce0bdbd045e07e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:26 +0200 Subject: [PATCH 4/5] selftests/bpf: Test an incomplete BPF CC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing neither cong_avoid() nor cong_control() is correctly rejected. This check solely depends on tcp_register_congestion_control() now, which is invoked during bpf_map__attach_struct_ops(). Signed-off-by: Jörn-Thorben Hinz Link: https://lore.kernel.org/r/20220622191227.898118-5-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 22 ++++++++++++ .../bpf/progs/tcp_ca_incompl_cong_ops.c | 35 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index e79f3f5a9d33..194d07310531 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -10,6 +10,7 @@ #include "bpf_tcp_nogpl.skel.h" #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" +#include "tcp_ca_incompl_cong_ops.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -339,6 +340,25 @@ static void test_write_sk_pacing(void) tcp_ca_write_sk_pacing__destroy(skel); } +static void test_incompl_cong_ops(void) +{ + struct tcp_ca_incompl_cong_ops *skel; + struct bpf_link *link; + + skel = tcp_ca_incompl_cong_ops__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + /* That cong_avoid() and cong_control() are missing is only reported at + * this point: + */ + link = bpf_map__attach_struct_ops(skel->maps.incompl_cong_ops); + ASSERT_ERR_PTR(link, "attach_struct_ops"); + + bpf_link__destroy(link); + tcp_ca_incompl_cong_ops__destroy(skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -353,4 +373,6 @@ void test_bpf_tcp_ca(void) test_rel_setsockopt(); if (test__start_subtest("write_sk_pacing")) test_write_sk_pacing(); + if (test__start_subtest("incompl_cong_ops")) + test_incompl_cong_ops(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c new file mode 100644 index 000000000000..7bb872fb22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +SEC("struct_ops/incompl_cong_ops_ssthresh") +__u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +SEC("struct_ops/incompl_cong_ops_undo_cwnd") +__u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +SEC(".struct_ops") +struct tcp_congestion_ops incompl_cong_ops = { + /* Intentionally leaving out any of the required cong_avoid() and + * cong_control() here. + */ + .ssthresh = (void *)incompl_cong_ops_ssthresh, + .undo_cwnd = (void *)incompl_cong_ops_undo_cwnd, + .name = "bpf_incompl_ops", +}; From f14a3f644a1c5a2e2dbe6073f51793119a12e6ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Wed, 22 Jun 2022 21:12:27 +0200 Subject: [PATCH 5/5] selftests/bpf: Test a BPF CC implementing the unsupported get_info() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether a TCP CC implemented in BPF providing get_info() is rejected correctly. get_info() is unsupported in a BPF CC. The check for required functions in a BPF CC has moved, this test ensures unsupported functions are still rejected correctly. Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220622191227.898118-6-jthinz@mailbox.tu-berlin.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 20 ++++++++++++++++++ .../bpf/progs/tcp_ca_unsupp_cong_op.c | 21 +++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 194d07310531..2959a52ced06 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -11,6 +11,7 @@ #include "bpf_dctcp_release.skel.h" #include "tcp_ca_write_sk_pacing.skel.h" #include "tcp_ca_incompl_cong_ops.skel.h" +#include "tcp_ca_unsupp_cong_op.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -359,6 +360,23 @@ static void test_incompl_cong_ops(void) tcp_ca_incompl_cong_ops__destroy(skel); } +static void test_unsupp_cong_op(void) +{ + libbpf_print_fn_t old_print_fn; + struct tcp_ca_unsupp_cong_op *skel; + + err_str = "attach to unsupported member get_info"; + found = false; + old_print_fn = libbpf_set_print(libbpf_debug_print); + + skel = tcp_ca_unsupp_cong_op__open_and_load(); + ASSERT_NULL(skel, "open_and_load"); + ASSERT_EQ(found, true, "expected_err_msg"); + + tcp_ca_unsupp_cong_op__destroy(skel); + libbpf_set_print(old_print_fn); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -375,4 +393,6 @@ void test_bpf_tcp_ca(void) test_write_sk_pacing(); if (test__start_subtest("incompl_cong_ops")) test_incompl_cong_ops(); + if (test__start_subtest("unsupp_cong_op")) + test_unsupp_cong_op(); } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c new file mode 100644 index 000000000000..c06f4a41c21a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/unsupp_cong_op_get_info") +size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + return 0; +} + +SEC(".struct_ops") +struct tcp_congestion_ops unsupp_cong_op = { + .get_info = (void *)unsupp_cong_op_get_info, + .name = "bpf_unsupp_op", +};