From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:40 +0200
Subject: [PATCH 1/8] bpf: avoid excessive stack usage for perf_sample_data

perf_sample_data consumes 386 bytes on stack; reduce this excessive
stack usage by moving it to a per cpu buffer instead. This is allowed
because preemption is disabled for tracing, xdp and tc programs, thus
at all times only one program can run on a specific CPU and programs
cannot run from interrupt. We similarly also handle bpf_pt_regs.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/bpf_trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08eb072430b9..051d7fca0c09 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_raw_record *raw)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
-	struct perf_sample_data sample_data;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
@@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	perf_sample_data_init(&sample_data, 0, 0);
-	sample_data.raw = raw;
-	perf_event_output(event, &sample_data, regs);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = raw;
+	perf_event_output(event, sd, regs);
 	return 0;
 }
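For context, the kernel path above is reached from BPF programs through the
long-standing bpf_perf_event_output() helper. A minimal, illustrative tc
classifier that exercises it could look as follows; this is a sketch, not
part of the series, and the map/function/section names are made up (it
assumes bpf_helpers.h from tools/testing/selftests/bpf for SEC(),
struct bpf_map_def and the helper declaration):

#include <linux/bpf.h>
#include <linux/types.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") events = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,	/* should cover number of possible CPUs */
};

SEC("classifier")
int emit_sample(struct __sk_buff *skb)
{
	__u64 len = skb->len;

	/* BPF_F_CURRENT_CPU selects the perf event of the executing
	 * CPU, the same CPU whose per cpu perf_sample_data the kernel
	 * side now uses.
	 */
	bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU,
			      &len, sizeof(len));
	return 0;
}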
From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:41 +0200
Subject: [PATCH 2/8] bpf: don't check spilled reg state for non-STACK_SPILLed type slots

spilled_regs[] state is only used for stack slots of type STACK_SPILL,
never for STACK_MISC. Right now, in states_equal(), even if we have old
and current stack state of type STACK_MISC, we compare spilled_regs[]
for that particular offset. Just skip these like we do everywhere else.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..d031b3b0752e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env,
			return false;
		if (i % BPF_REG_SIZE)
			continue;
+		if (old->stack_slot_type[i] != STACK_SPILL)
+			continue;
		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
			   &cur->spilled_regs[i / BPF_REG_SIZE],
			   sizeof(old->spilled_regs[0])))

From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:42 +0200
Subject: [PATCH 3/8] bpf: reset id on CONST_IMM transition

Whenever we set the register to the type CONST_IMM, we currently don't
reset the id to 0. The id member is not used in the CONST_IMM case, so
don't let it become stale; otherwise state pruning won't be able to
match later on.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d031b3b0752e..d195d825515a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
			 */
			regs[insn->dst_reg].type = CONST_IMM;
			regs[insn->dst_reg].imm = insn->imm;
+			regs[insn->dst_reg].id = 0;
			regs[insn->dst_reg].max_value = insn->imm;
			regs[insn->dst_reg].min_value = insn->imm;
			regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
		regs[insn->dst_reg].type = CONST_IMM;
		regs[insn->dst_reg].imm = imm;
+		regs[insn->dst_reg].id = 0;
		return 0;
	}

From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:43 +0200
Subject: [PATCH 4/8] bpf: reset id on spilled regs in clear_all_pkt_pointers

Right now, we don't reset the id of spilled registers in case of
clear_all_pkt_pointers(). Given pkt_pointers are highly likely to
contain an id, do so by reusing __mark_reg_unknown_value().

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d195d825515a..519a6144d3d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
		if (reg->type != PTR_TO_PACKET &&
		    reg->type != PTR_TO_PACKET_END)
			continue;
-		reg->type = UNKNOWN_VALUE;
-		reg->imm = 0;
+		__mark_reg_unknown_value(state->spilled_regs,
+					 i / BPF_REG_SIZE);
	}
 }
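Patches 2 and 4 both rest on the same layout of the verifier's stack state:
there is one slot type per stack byte, but register state only per 8-byte
STACK_SPILL slot, so spilled_regs[] may only be consulted (or reset) for
slots whose type is STACK_SPILL. An illustrative loop skeleton of that
access pattern, not actual verifier code:

	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
		if (state->stack_slot_type[i] != STACK_SPILL)
			continue; /* STACK_MISC etc. carry no reg state */

		reg = &state->spilled_regs[i / BPF_REG_SIZE];
		/* ... compare (patch 2) or reset (patch 4) reg here ... */
	}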
From 5ecf51fd9cc69a4a6099340b30f8171c7cd04394 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:44 +0200
Subject: [PATCH 5/8] bpf, tests: add a test for htab lookup + update traversal

Add a test case to track behaviour when traversing and updating the
htab map. We recently used such traversal, so it's quite useful to
keep it as an example in selftests.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 tools/testing/selftests/bpf/test_maps.c | 50 +++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 93314524de0d..79601c81e169 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -239,6 +239,54 @@ static void test_hashmap_percpu(int task, void *data)
	close(fd);
 }
 
+static void test_hashmap_walk(int task, void *data)
+{
+	int fd, i, max_entries = 100000;
+	long long key, value, next_key;
+	bool next_key_valid = true;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+			    max_entries, map_flags);
+	if (fd < 0) {
+		printf("Failed to create hashmap '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	for (i = 0; i < max_entries; i++) {
+		key = i; value = key;
+		assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == 0);
+	}
+
+	for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
+					 &next_key) == 0; i++) {
+		key = next_key;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+	}
+
+	assert(i == max_entries);
+
+	assert(bpf_map_get_next_key(fd, NULL, &key) == 0);
+	for (i = 0; next_key_valid; i++) {
+		next_key_valid = bpf_map_get_next_key(fd, &key, &next_key) == 0;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+		value++;
+		assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
+		key = next_key;
+	}
+
+	assert(i == max_entries);
+
+	for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
+					 &next_key) == 0; i++) {
+		key = next_key;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+		assert(value - 1 == key);
+	}
+
+	assert(i == max_entries);
+	close(fd);
+}
+
 static void test_arraymap(int task, void *data)
 {
	int key, next_key, fd;
@@ -464,6 +512,7 @@ static void test_map_stress(void)
	run_parallel(100, test_hashmap, NULL);
	run_parallel(100, test_hashmap_percpu, NULL);
	run_parallel(100, test_hashmap_sizes, NULL);
+	run_parallel(100, test_hashmap_walk, NULL);
 
	run_parallel(100, test_arraymap, NULL);
	run_parallel(100, test_arraymap_percpu, NULL);
@@ -549,6 +598,7 @@ static void run_all_tests(void)
 {
	test_hashmap(0, NULL);
	test_hashmap_percpu(0, NULL);
+	test_hashmap_walk(0, NULL);
 
	test_arraymap(0, NULL);
	test_arraymap_percpu(0, NULL);
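The heart of the new test is the bpf_map_get_next_key() walk idiom: passing
NULL as the current key starts iteration from the beginning, and a non-zero
return signals the end. Stripped of the asserts, a sketch of the
update-during-walk pattern the test pins down, assuming fd refers to an
already populated hash map with long long keys and values:

	long long key, next_key, value;
	int err;

	for (err = bpf_map_get_next_key(fd, NULL, &next_key); !err;
	     err = bpf_map_get_next_key(fd, &key, &next_key)) {
		key = next_key;
		if (bpf_map_lookup_elem(fd, &key, &value) == 0) {
			value++;
			bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
		}
	}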
From f735b64926da36aadc0737d29dde587ac1e2e2c2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:45 +0200
Subject: [PATCH 6/8] bpf, tests: set rlimit also for test_align, so it doesn't fail

When running all tests through 'make run_tests', test_align was failing
due to an insufficient rlimit. Set it the same way as all other test
cases from the BPF selftests do, so that the test case can properly
load everything.

  [...]
  Summary: 7 PASSED, 1 FAILED
  selftests: test_progs [PASS]
  /home/foo/net-next/tools/testing/selftests/bpf
  Test   0: mov ... Failed to load program.
  FAIL
  Test   1: shift ... Failed to load program.
  FAIL
  Test   2: addsub ... Failed to load program.
  FAIL
  Test   3: mul ... Failed to load program.
  FAIL
  Test   4: unknown shift ... Failed to load program.
  FAIL
  Test   5: unknown mul ... Failed to load program.
  FAIL
  Test   6: packet const offset ... Failed to load program.
  FAIL
  Test   7: packet variable offset ... Failed to load program.
  FAIL
  Results: 0 pass 8 fail
  selftests: test_align [PASS]
  [...]

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 tools/testing/selftests/bpf/test_align.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 9644d4e069de..1426594fdf6b 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -9,6 +9,8 @@
 #include
 #include
 
+#include <sys/resource.h>
+
 #include
 #include
 #include
@@ -432,6 +434,9 @@ static int do_test(unsigned int from, unsigned int to)
 int main(int argc, char **argv)
 {
	unsigned int from = 0, to = ARRAY_SIZE(tests);
+	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
+
+	setrlimit(RLIMIT_MEMLOCK, &rinf);
 
	if (argc == 3) {
		unsigned int l = atoi(argv[argc - 2]);

From 966789fb8636a35c3263076fea382263c5e56802 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:46 +0200
Subject: [PATCH 7/8] bpf: remove cg_skb_func_proto and use sk_filter_func_proto directly

Since cg_skb_func_proto() doesn't do anything else than just calling
into sk_filter_func_proto(), remove it and set sk_filter_func_proto()
directly for the .get_func_proto callback.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/core/filter.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 946f758d44f2..4867391126e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2774,12 +2774,6 @@ xdp_func_proto(enum bpf_func_id func_id)
	}
 }
 
-static const struct bpf_func_proto *
-cg_skb_func_proto(enum bpf_func_id func_id)
-{
-	return sk_filter_func_proto(func_id);
-}
-
 static const struct bpf_func_proto *
 lwt_inout_func_proto(enum bpf_func_id func_id)
 {
@@ -3344,7 +3338,7 @@ const struct bpf_verifier_ops xdp_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_skb_prog_ops = {
-	.get_func_proto		= cg_skb_func_proto,
+	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.test_run		= bpf_prog_test_run_skb,

From ded092cd73c2c56a394b936f86897f29b2e131c0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:47 +0200
Subject: [PATCH 8/8] bpf: add bpf_set_hash helper for tc progs

Allow tc BPF programs to set skb->hash directly, in addition to the
clearing and recalc triggering we have right now. This allows BPF to
implement a custom hashing routine for skb_get_hash().

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/uapi/linux/bpf.h       |  8 +++++++-
 net/core/filter.c              | 20 ++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  8 +++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b2c10b45733..f94b48b168dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -513,6 +513,11 @@ union bpf_attr {
  *     Get the owner uid of the socket stored inside sk_buff.
  *     @skb: pointer to skb
  *     Return: uid of the socket owner on success or overflowuid if failed.
+ *
+ * u32 bpf_set_hash(skb, hash)
+ *     Set full skb->hash.
+ *     @skb: pointer to skb
+ *     @hash: hash to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -562,7 +567,8 @@ union bpf_attr {
	FN(xdp_adjust_head),		\
	FN(probe_read_str),		\
	FN(get_socket_cookie),		\
-	FN(get_socket_uid),
+	FN(get_socket_uid),		\
+	FN(set_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 4867391126e4..a65a3b25e104 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1874,6 +1874,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
+{
+	/* Set user specified hash as L4(+), so that it gets returned
+	 * on skb_get_hash() call unless BPF prog later on triggers a
+	 * skb_clear_hash().
+	 */
+	__skb_set_sw_hash(skb, hash, true);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_proto = {
+	.func		= bpf_set_hash,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
	   u16, vlan_tci)
 {
@@ -2744,6 +2762,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_set_hash:
+		return &bpf_set_hash_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9b2c10b45733..f94b48b168dc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -513,6 +513,11 @@ union bpf_attr {
  *     Get the owner uid of the socket stored inside sk_buff.
  *     @skb: pointer to skb
  *     Return: uid of the socket owner on success or overflowuid if failed.
+ *
+ * u32 bpf_set_hash(skb, hash)
+ *     Set full skb->hash.
+ *     @skb: pointer to skb
+ *     @hash: hash to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -562,7 +567,8 @@ union bpf_attr {
	FN(xdp_adjust_head),		\
	FN(probe_read_str),		\
	FN(get_socket_cookie),		\
-	FN(get_socket_uid),
+	FN(get_socket_uid),		\
+	FN(set_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
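A minimal tc (sched_cls) side sketch of the new helper, again illustrative
rather than part of the series; since bpf_set_hash() is brand new, the
helper is declared by hand from BPF_FUNC_set_hash, and the SEC() macro,
section name and program name are the usual conventions, not mandated here:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

/* new helper from this patch, declared manually until the selftests'
 * bpf_helpers.h grows a wrapper for it
 */
static unsigned int (*bpf_set_hash)(void *ctx, unsigned int hash) =
	(void *) BPF_FUNC_set_hash;

SEC("classifier")
int set_custom_hash(struct __sk_buff *skb)
{
	/* Any custom routine could compute this; a constant is used
	 * for brevity. A later skb_get_hash() returns it as-is instead
	 * of invoking the flow dissector, until the hash is cleared.
	 */
	bpf_set_hash(skb, 0xcafe);
	return 0;
}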