From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:40 +0200
Subject: [PATCH 1/8] bpf: avoid excessive stack usage for perf_sample_data

perf_sample_data consumes 386 bytes on stack; reduce this excessive
stack usage by moving it to a per cpu buffer instead. This is allowed
because preemption is disabled for tracing, xdp and tc programs, thus
at all times only one program can run on a specific CPU and programs
cannot run from interrupt. We similarly also handle bpf_pt_regs.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/bpf_trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08eb072430b9..051d7fca0c09 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_raw_record *raw)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
-	struct perf_sample_data sample_data;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
@@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	perf_sample_data_init(&sample_data, 0, 0);
-	sample_data.raw = raw;
-	perf_event_output(event, &sample_data, regs);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = raw;
+	perf_event_output(event, sd, regs);
 	return 0;
 }
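For context, the kernel path above is reached from BPF programs through the
long-standing bpf_perf_event_output() helper. A minimal, illustrative tc
classifier that exercises it could look as follows; this is a sketch, not
part of the series, and the map/function/section names are made up (it
assumes bpf_helpers.h from tools/testing/selftests/bpf for SEC(),
struct bpf_map_def and the helper declaration):

#include <linux/bpf.h>
#include <linux/types.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") events = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,	/* should cover number of possible CPUs */
};

SEC("classifier")
int emit_sample(struct __sk_buff *skb)
{
	__u64 len = skb->len;

	/* BPF_F_CURRENT_CPU selects the perf event of the executing
	 * CPU, the same CPU whose per cpu perf_sample_data the kernel
	 * side now uses.
	 */
	bpf_perf_event_output(skb, &events, BPF_F_CURRENT_CPU,
			      &len, sizeof(len));
	return 0;
}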
From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:41 +0200
Subject: [PATCH 2/8] bpf: don't check spilled reg state for non-STACK_SPILLed type slots

spilled_regs[] state is only used for stack slots of type STACK_SPILL,
never for STACK_MISC. Right now, in states_equal(), even if we have old
and current stack state of type STACK_MISC, we compare spilled_regs[]
for that particular offset. Just skip these like we do everywhere else.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..d031b3b0752e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env,
			return false;
		if (i % BPF_REG_SIZE)
			continue;
+		if (old->stack_slot_type[i] != STACK_SPILL)
+			continue;
		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
			   &cur->spilled_regs[i / BPF_REG_SIZE],
			   sizeof(old->spilled_regs[0])))

From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:42 +0200
Subject: [PATCH 3/8] bpf: reset id on CONST_IMM transition

Whenever we set the register to the type CONST_IMM, we currently don't
reset the id to 0. The id member is not used in the CONST_IMM case, so
don't let it become stale; otherwise state pruning won't be able to
match later on.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d031b3b0752e..d195d825515a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
			 */
			regs[insn->dst_reg].type = CONST_IMM;
			regs[insn->dst_reg].imm = insn->imm;
+			regs[insn->dst_reg].id = 0;
			regs[insn->dst_reg].max_value = insn->imm;
			regs[insn->dst_reg].min_value = insn->imm;
			regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
		regs[insn->dst_reg].type = CONST_IMM;
		regs[insn->dst_reg].imm = imm;
+		regs[insn->dst_reg].id = 0;
		return 0;
	}

From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:43 +0200
Subject: [PATCH 4/8] bpf: reset id on spilled regs in clear_all_pkt_pointers

Right now, we don't reset the id of spilled registers in case of
clear_all_pkt_pointers(). Given pkt_pointers are highly likely to
contain an id, do so by reusing __mark_reg_unknown_value().

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d195d825515a..519a6144d3d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
		if (reg->type != PTR_TO_PACKET &&
		    reg->type != PTR_TO_PACKET_END)
			continue;
-		reg->type = UNKNOWN_VALUE;
-		reg->imm = 0;
+		__mark_reg_unknown_value(state->spilled_regs,
+					 i / BPF_REG_SIZE);
	}
 }
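Patches 2 and 4 both rest on the same layout of the verifier's stack state:
there is one slot type per stack byte, but register state only per 8-byte
STACK_SPILL slot, so spilled_regs[] may only be consulted (or reset) for
slots whose type is STACK_SPILL. An illustrative loop skeleton of that
access pattern, not actual verifier code:

	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
		if (state->stack_slot_type[i] != STACK_SPILL)
			continue; /* STACK_MISC etc. carry no reg state */

		reg = &state->spilled_regs[i / BPF_REG_SIZE];
		/* ... compare (patch 2) or reset (patch 4) reg here ... */
	}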
From 5ecf51fd9cc69a4a6099340b30f8171c7cd04394 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:44 +0200
Subject: [PATCH 5/8] bpf, tests: add a test for htab lookup + update traversal

Add a test case to track behaviour when traversing and updating the
htab map. We recently used such traversal, so it's quite useful to
keep it as an example in selftests.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 tools/testing/selftests/bpf/test_maps.c | 50 +++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 93314524de0d..79601c81e169 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -239,6 +239,54 @@ static void test_hashmap_percpu(int task, void *data)
	close(fd);
 }
 
+static void test_hashmap_walk(int task, void *data)
+{
+	int fd, i, max_entries = 100000;
+	long long key, value, next_key;
+	bool next_key_valid = true;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+			    max_entries, map_flags);
+	if (fd < 0) {
+		printf("Failed to create hashmap '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	for (i = 0; i < max_entries; i++) {
+		key = i; value = key;
+		assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == 0);
+	}
+
+	for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
+					 &next_key) == 0; i++) {
+		key = next_key;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+	}
+
+	assert(i == max_entries);
+
+	assert(bpf_map_get_next_key(fd, NULL, &key) == 0);
+	for (i = 0; next_key_valid; i++) {
+		next_key_valid = bpf_map_get_next_key(fd, &key, &next_key) == 0;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+		value++;
+		assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
+		key = next_key;
+	}
+
+	assert(i == max_entries);
+
+	for (i = 0; bpf_map_get_next_key(fd, !i ? NULL : &key,
+					 &next_key) == 0; i++) {
+		key = next_key;
+		assert(bpf_map_lookup_elem(fd, &key, &value) == 0);
+		assert(value - 1 == key);
+	}
+
+	assert(i == max_entries);
+	close(fd);
+}
+
 static void test_arraymap(int task, void *data)
 {
	int key, next_key, fd;
@@ -464,6 +512,7 @@ static void test_map_stress(void)
	run_parallel(100, test_hashmap, NULL);
	run_parallel(100, test_hashmap_percpu, NULL);
	run_parallel(100, test_hashmap_sizes, NULL);
+	run_parallel(100, test_hashmap_walk, NULL);
 
	run_parallel(100, test_arraymap, NULL);
	run_parallel(100, test_arraymap_percpu, NULL);
@@ -549,6 +598,7 @@ static void run_all_tests(void)
 {
	test_hashmap(0, NULL);
	test_hashmap_percpu(0, NULL);
+	test_hashmap_walk(0, NULL);
 
	test_arraymap(0, NULL);
	test_arraymap_percpu(0, NULL);
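The heart of the new test is the bpf_map_get_next_key() walk idiom: passing
NULL as the current key starts iteration from the beginning, and a non-zero
return signals the end. Stripped of the asserts, a sketch of the
update-during-walk pattern the test pins down, assuming fd refers to an
already populated hash map with long long keys and values:

	long long key, next_key, value;
	int err;

	for (err = bpf_map_get_next_key(fd, NULL, &next_key); !err;
	     err = bpf_map_get_next_key(fd, &key, &next_key)) {
		key = next_key;
		if (bpf_map_lookup_elem(fd, &key, &value) == 0) {
			value++;
			bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
		}
	}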
From f735b64926da36aadc0737d29dde587ac1e2e2c2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:45 +0200
Subject: [PATCH 6/8] bpf, tests: set rlimit also for test_align, so it doesn't fail

When running all tests through 'make run_tests', test_align was failing
due to an insufficient rlimit. Set it the same way as all other test
cases from the BPF selftests do, so that the test case can properly
load everything.

  [...]
  Summary: 7 PASSED, 1 FAILED
  selftests: test_progs [PASS]
  /home/foo/net-next/tools/testing/selftests/bpf
  Test   0: mov ... Failed to load program.
  FAIL
  Test   1: shift ... Failed to load program.
  FAIL
  Test   2: addsub ... Failed to load program.
  FAIL
  Test   3: mul ... Failed to load program.
  FAIL
  Test   4: unknown shift ... Failed to load program.
  FAIL
  Test   5: unknown mul ... Failed to load program.
  FAIL
  Test   6: packet const offset ... Failed to load program.
  FAIL
  Test   7: packet variable offset ... Failed to load program.
  FAIL
  Results: 0 pass 8 fail
  selftests: test_align [PASS]
  [...]

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 tools/testing/selftests/bpf/test_align.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index 9644d4e069de..1426594fdf6b 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -9,6 +9,8 @@
 #include
 #include
 
+#include <sys/resource.h>
+
 #include
 #include
 #include
@@ -432,6 +434,9 @@ static int do_test(unsigned int from, unsigned int to)
 int main(int argc, char **argv)
 {
	unsigned int from = 0, to = ARRAY_SIZE(tests);
+	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
+
+	setrlimit(RLIMIT_MEMLOCK, &rinf);
 
	if (argc == 3) {
		unsigned int l = atoi(argv[argc - 2]);

From 966789fb8636a35c3263076fea382263c5e56802 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:46 +0200
Subject: [PATCH 7/8] bpf: remove cg_skb_func_proto and use sk_filter_func_proto directly

Since cg_skb_func_proto() doesn't do anything else than just calling
into sk_filter_func_proto(), remove it and set sk_filter_func_proto()
directly for the .get_func_proto callback.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/core/filter.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 946f758d44f2..4867391126e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2774,12 +2774,6 @@ xdp_func_proto(enum bpf_func_id func_id)
	}
 }
 
-static const struct bpf_func_proto *
-cg_skb_func_proto(enum bpf_func_id func_id)
-{
-	return sk_filter_func_proto(func_id);
-}
-
 static const struct bpf_func_proto *
 lwt_inout_func_proto(enum bpf_func_id func_id)
 {
@@ -3344,7 +3338,7 @@ const struct bpf_verifier_ops xdp_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_skb_prog_ops = {
-	.get_func_proto		= cg_skb_func_proto,
+	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.test_run		= bpf_prog_test_run_skb,

From ded092cd73c2c56a394b936f86897f29b2e131c0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Jun 2017 00:50:47 +0200
Subject: [PATCH 8/8] bpf: add bpf_set_hash helper for tc progs

Allow tc BPF programs to set skb->hash directly, in addition to the
clearing and recalc triggering we have right now. This allows BPF to
implement a custom hashing routine for skb_get_hash().

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/uapi/linux/bpf.h       |  8 +++++++-
 net/core/filter.c              | 20 ++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  8 +++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b2c10b45733..f94b48b168dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -513,6 +513,11 @@ union bpf_attr {
  *     Get the owner uid of the socket stored inside sk_buff.
  *     @skb: pointer to skb
  *     Return: uid of the socket owner on success or overflowuid if failed.
+ *
+ * u32 bpf_set_hash(skb, hash)
+ *     Set full skb->hash.
+ *     @skb: pointer to skb
+ *     @hash: hash to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -562,7 +567,8 @@ union bpf_attr {
	FN(xdp_adjust_head),		\
	FN(probe_read_str),		\
	FN(get_socket_cookie),		\
-	FN(get_socket_uid),
+	FN(get_socket_uid),		\
+	FN(set_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 4867391126e4..a65a3b25e104 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1874,6 +1874,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
+{
+	/* Set user specified hash as L4(+), so that it gets returned
+	 * on skb_get_hash() call unless BPF prog later on triggers a
+	 * skb_clear_hash().
+	 */
+	__skb_set_sw_hash(skb, hash, true);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_proto = {
+	.func		= bpf_set_hash,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
	   u16, vlan_tci)
 {
@@ -2744,6 +2762,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_set_hash:
+		return &bpf_set_hash_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9b2c10b45733..f94b48b168dc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -513,6 +513,11 @@ union bpf_attr {
  *     Get the owner uid of the socket stored inside sk_buff.
  *     @skb: pointer to skb
  *     Return: uid of the socket owner on success or overflowuid if failed.
+ *
+ * u32 bpf_set_hash(skb, hash)
+ *     Set full skb->hash.
+ *     @skb: pointer to skb
+ *     @hash: hash to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -562,7 +567,8 @@ union bpf_attr {
	FN(xdp_adjust_head),		\
	FN(probe_read_str),		\
	FN(get_socket_cookie),		\
-	FN(get_socket_uid),
+	FN(get_socket_uid),		\
+	FN(set_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
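A minimal tc (sched_cls) side sketch of the new helper, again illustrative
rather than part of the series; since bpf_set_hash() is brand new, the
helper is declared by hand from BPF_FUNC_set_hash, and the SEC() macro,
section name and program name are the usual conventions, not mandated here:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

/* new helper from this patch, declared manually until the selftests'
 * bpf_helpers.h grows a wrapper for it
 */
static unsigned int (*bpf_set_hash)(void *ctx, unsigned int hash) =
	(void *) BPF_FUNC_set_hash;

SEC("classifier")
int set_custom_hash(struct __sk_buff *skb)
{
	/* Any custom routine could compute this; a constant is used
	 * for brevity. A later skb_get_hash() returns it as-is instead
	 * of invoking the flow dissector, until the hash is cleared.
	 */
	bpf_set_hash(skb, 0xcafe);
	return 0;
}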