From e06fa9c16ce4b740996189fa5610eabcee734e6c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 24 Aug 2018 22:08:50 +0200
Subject: [PATCH 1/8] bpf, sockmap: fix potential use after free in
 bpf_tcp_close

bpf_tcp_close() we pop the psock linkage to a map via psock_map_pop().
A parallel update on the sock hash map can happen between psock_map_pop()
and lookup_elem_raw() where we override the element under link->hash /
link->key. In bpf_tcp_close()'s lookup_elem_raw() we subsequently only
test whether an element is present, but we do not test whether the
element is infact the element we were looking for.

We lock the sock in bpf_tcp_close() during that time, so do we hold
the lock in sock_hash_update_elem(). However, the latter locks the
sock which is newly updated, not the one we're purging from the hash
table. This means that while one CPU is doing the lookup from bpf_tcp_close(),
another CPU is doing the map update in parallel, dropped our sock from
the hlist and released the psock.

Subsequently the first CPU will find the new sock and attempts to drop
and release the old sock yet another time. Fix is that we need to check
the elements for a match after lookup, similar as we do in the sock map.
Note that the hash tab elems are freed via RCU, so access to their
link->hash / link->key is fine since we're under RCU read side there.

Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index cf5195c7c331..01879e4d599a 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 			/* If another thread deleted this object skip deletion.
 			 * The refcnt on psock may or may not be zero.
 			 */
-			if (l) {
+			if (l && l == link) {
 				hlist_del_rcu(&link->hash_node);
 				smap_release_sock(psock, link->sk);
 				free_htab_elem(htab, link);

From 15c480efab01197c965ce0562a43ffedd852b8f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 24 Aug 2018 22:08:51 +0200
Subject: [PATCH 2/8] bpf, sockmap: fix psock refcount leak in bpf_tcp_recvmsg

In bpf_tcp_recvmsg() we first took a reference on the psock, however
once we find that there are skbs in the normal socket's receive queue
we return with processing them through tcp_recvmsg(). Problem is that
we leak the taken reference on the psock in that path. Given we don't
really do anything with the psock at this point, move the skb_queue_empty()
test before we fetch the psock to fix this case.

Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 01879e4d599a..26d8a3053407 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -912,6 +912,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
 	rcu_read_lock();
 	psock = smap_psock_sk(sk);
@@ -922,9 +924,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		goto out;
 	rcu_read_unlock();
 
-	if (!skb_queue_empty(&sk->sk_receive_queue))
-		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
 	lock_sock(sk);
 bytes_ready:
 	while (copied != len) {

From 3f6e138d41ddff196f452993528cfe75762ede0f Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Mon, 27 Aug 2018 21:30:42 +0200
Subject: [PATCH 3/8] bpf: fix build error with clang

Building the newly introduced BPF_PROG_TYPE_SK_REUSEPORT leads to
a compile time error when building with clang:
net/core/filter.o: In function `sk_reuseport_convert_ctx_access':
  ../net/core/filter.c:7284: undefined reference to `__compiletime_assert_7284'

It seems that clang has issues resolving hweight_long at compile
time. Since SK_FL_PROTO_MASK is a constant, we can use the interface
for known constant arguments which works fine with clang.

Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT")
Signed-off-by: Stefan Agner <stefan@agner.ch>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index c25eb36f1320..7a2430945c71 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7281,7 +7281,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct sk_reuseport_md, ip_protocol):
-		BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+		BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
 		SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
 						    BPF_W, 0);
 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);

From 501ca81760c204ec59b73e4a00bee5971fc0f1b1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 24 Aug 2018 17:37:00 -0700
Subject: [PATCH 4/8] bpf: sockmap, decrement copied count correctly in
 redirect error case

Currently, when a redirect occurs in sockmap and an error occurs in
the redirect call we unwind the scatterlist once in the error path
of bpf_tcp_sendmsg_do_redirect() and then again in sendmsg(). Then
in the error path of sendmsg we decrement the copied count by the
send size.

However, its possible we partially sent data before the error was
generated. This can happen if do_tcp_sendpages() partially sends the
scatterlist before encountering a memory pressure error. If this
happens we need to decrement the copied value (the value tracking
how many bytes were actually sent to TCP stack) by the number of
remaining bytes _not_ the entire send size. Otherwise we risk
confusing userspace.

Also we don't need two calls to free the scatterlist one is
good enough. So remove the one in bpf_tcp_sendmsg_do_redirect() and
then properly reduce copied by the number of remaining bytes which
may in fact be the entire send size if no bytes were sent.

To do this use bool to indicate if free_start_sg() should do mem
accounting or not.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 45 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 26d8a3053407..ce63e5801746 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk)
 }
 
 static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
 
 static void bpf_tcp_release(struct sock *sk)
 {
@@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk)
 		goto out;
 
 	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork);
+		free_start_sg(psock->sock, psock->cork, true);
 		kfree(psock->cork);
 		psock->cork = NULL;
 	}
@@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	close_fun = psock->save_close;
 
 	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork);
+		free_start_sg(psock->sock, psock->cork, true);
 		kfree(psock->cork);
 		psock->cork = NULL;
 	}
 
 	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
 		list_del(&md->list);
-		free_start_sg(psock->sock, md);
+		free_start_sg(psock->sock, md, true);
 		kfree(md);
 	}
 
@@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes,
 	md->sg_start = i;
 }
 
-static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+static int free_sg(struct sock *sk, int start,
+		   struct sk_msg_buff *md, bool charge)
 {
 	struct scatterlist *sg = md->sg_data;
 	int i = start, free = 0;
 
 	while (sg[i].length) {
 		free += sg[i].length;
-		sk_mem_uncharge(sk, sg[i].length);
+		if (charge)
+			sk_mem_uncharge(sk, sg[i].length);
 		if (!md->skb)
 			put_page(sg_page(&sg[i]));
 		sg[i].length = 0;
@@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
 	return free;
 }
 
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
 {
-	int free = free_sg(sk, md->sg_start, md);
+	int free = free_sg(sk, md->sg_start, md, charge);
 
 	md->sg_start = md->sg_end;
 	return free;
@@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
 
 static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
 {
-	return free_sg(sk, md->sg_curr, md);
+	return free_sg(sk, md->sg_curr, md, true);
 }
 
 static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
@@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		list_add_tail(&r->list, &psock->ingress);
 		sk->sk_data_ready(sk);
 	} else {
-		free_start_sg(sk, r);
+		free_start_sg(sk, r, true);
 		kfree(r);
 	}
 
@@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 		release_sock(sk);
 	}
 	smap_release_sock(psock, sk);
-	if (unlikely(err))
-		goto out;
-	return 0;
+	return err;
 out_rcu:
 	rcu_read_unlock();
-out:
-	free_bytes_sg(NULL, send, md, false);
-	return err;
+	return 0;
 }
 
 static inline void bpf_md_init(struct smap_psock *psock)
@@ -822,7 +820,7 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
 	case __SK_PASS:
 		err = bpf_tcp_push(sk, send, m, flags, true);
 		if (unlikely(err)) {
-			*copied -= free_start_sg(sk, m);
+			*copied -= free_start_sg(sk, m, true);
 			break;
 		}
 
@@ -845,16 +843,17 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
 		lock_sock(sk);
 
 		if (unlikely(err < 0)) {
-			free_start_sg(sk, m);
+			int free = free_start_sg(sk, m, false);
+
 			psock->sg_size = 0;
 			if (!cork)
-				*copied -= send;
+				*copied -= free;
 		} else {
 			psock->sg_size -= send;
 		}
 
 		if (cork) {
-			free_start_sg(sk, m);
+			free_start_sg(sk, m, true);
 			psock->sg_size = 0;
 			kfree(m);
 			m = NULL;
@@ -1121,7 +1120,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		err = sk_stream_wait_memory(sk, &timeo);
 		if (err) {
 			if (m && m != psock->cork)
-				free_start_sg(sk, m);
+				free_start_sg(sk, m, true);
 			goto out_err;
 		}
 	}
@@ -1580,13 +1579,13 @@ static void smap_gc_work(struct work_struct *w)
 		bpf_prog_put(psock->bpf_tx_msg);
 
 	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork);
+		free_start_sg(psock->sock, psock->cork, true);
 		kfree(psock->cork);
 	}
 
 	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
 		list_del(&md->list);
-		free_start_sg(psock->sock, md);
+		free_start_sg(psock->sock, md, true);
 		kfree(md);
 	}
 

From 5b24109b0563d45094c470684c1f8cea1af269f8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Aug 2018 16:15:35 +0200
Subject: [PATCH 5/8] bpf: fix several offset tests in bpf_msg_pull_data

While recently going over bpf_msg_pull_data(), I noticed three
issues which are fixed in here:

1) When we attempt to find the first scatterlist element (sge)
   for the start offset, we add len to the offset before we check
   for start < offset + len, whereas it should come after when
   we iterate to the next sge to accumulate the offsets. For
   example, given a start offset of 12 with a sge length of 8
   for the first sge in the list would lead us to determine this
   sge as the first sge thinking it covers first 16 bytes where
   start is located, whereas start sits in subsequent sges so
   we would end up pulling in the wrong data.

2) After figuring out the starting sge, we have a short-cut test
   in !msg->sg_copy[i] && bytes <= len. This checks whether it's
   not needed to make the page at the sge private where we can
   just exit by updating msg->data and msg->data_end. However,
   the length test is not fully correct. bytes <= len checks
   whether the requested bytes (end - start offsets) fit into the
   sge's length. The part that is missing is that start must not
   be sge length aligned. Meaning, the start offset into the sge
   needs to be accounted as well on top of the requested bytes
   as otherwise we can access the sge out of bounds. For example
   the sge could have length of 8, our requested bytes could have
   length of 8, but at a start offset of 4, so we also would need
   to pull in 4 bytes of the next sge, when we jump to the out
   label we do set msg->data to sg_virt(&sg[i]) + start - offset
   and msg->data_end to msg->data + bytes which would be oob.

3) The subsequent bytes < copy test for finding the last sge has
   the same issue as in point 2) but also it tests for less than
   rather than less or equal to. Meaning if the sge length is of
   8 and requested bytes of 8 while having the start aligned with
   the sge, we would unnecessarily go and pull in the next sge as
   well to make it private.

Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 7a2430945c71..ec4d67c0cf0c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2286,10 +2286,10 @@ BPF_CALL_4(bpf_msg_pull_data,
 	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
 {
 	unsigned int len = 0, offset = 0, copy = 0;
+	int bytes = end - start, bytes_sg_total;
 	struct scatterlist *sg = msg->sg_data;
 	int first_sg, last_sg, i, shift;
 	unsigned char *p, *to, *from;
-	int bytes = end - start;
 	struct page *page;
 
 	if (unlikely(flags || end <= start))
@@ -2299,9 +2299,9 @@ BPF_CALL_4(bpf_msg_pull_data,
 	i = msg->sg_start;
 	do {
 		len = sg[i].length;
-		offset += len;
 		if (start < offset + len)
 			break;
+		offset += len;
 		i++;
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
@@ -2310,7 +2310,11 @@ BPF_CALL_4(bpf_msg_pull_data,
 	if (unlikely(start >= offset + len))
 		return -EINVAL;
 
-	if (!msg->sg_copy[i] && bytes <= len)
+	/* The start may point into the sg element so we need to also
+	 * account for the headroom.
+	 */
+	bytes_sg_total = start - offset + bytes;
+	if (!msg->sg_copy[i] && bytes_sg_total <= len)
 		goto out;
 
 	first_sg = i;
@@ -2330,12 +2334,12 @@ BPF_CALL_4(bpf_msg_pull_data,
 		i++;
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
-		if (bytes < copy)
+		if (bytes_sg_total <= copy)
 			break;
 	} while (i != msg->sg_end);
 	last_sg = i;
 
-	if (unlikely(copy < end - start))
+	if (unlikely(bytes_sg_total > copy))
 		return -EINVAL;
 
 	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));

From 0e06b227c5221dd51b5569de93f3b9f532be4a32 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Aug 2018 16:50:34 +0200
Subject: [PATCH 6/8] bpf: fix msg->data/data_end after sg shift repair in
 bpf_msg_pull_data

In the current code, msg->data is set as sg_virt(&sg[i]) + start - offset
and msg->data_end relative to it as msg->data + bytes. Using iterator i
to point to the updated starting scatterlist element holds true for some
cases, however not for all where we'd end up pointing out of bounds. It
is /correct/ for these ones:

1) When first finding the starting scatterlist element (sge) where we
   find that the page is already privately owned by the msg and where
   the requested bytes and headroom fit into the sge's length.

However, it's /incorrect/ for the following ones:

2) After we made the requested area private and updated the newly allocated
   page into first_sg slot of the scatterlist ring; when we find that no
   shift repair of the ring is needed where we bail out updating msg->data
   and msg->data_end. At that point i will point to last_sg, which in this
   case is the next elem of first_sg in the ring. The sge at that point
   might as well be invalid (e.g. i == msg->sg_end), which we use for
   setting the range of sg_virt(&sg[i]). The correct one would have been
   first_sg.

3) Similar as in 2) but when we find that a shift repair of the ring is
   needed. In this case we fix up all sges and stop once we've reached the
   end. In this case i will point to will point to the new msg->sg_end,
   and the sge at that point will be invalid. Again here the requested
   range sits in first_sg.

Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index ec4d67c0cf0c..b9225c55926a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2310,6 +2310,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 	if (unlikely(start >= offset + len))
 		return -EINVAL;
 
+	first_sg = i;
 	/* The start may point into the sg element so we need to also
 	 * account for the headroom.
 	 */
@@ -2317,8 +2318,6 @@ BPF_CALL_4(bpf_msg_pull_data,
 	if (!msg->sg_copy[i] && bytes_sg_total <= len)
 		goto out;
 
-	first_sg = i;
-
 	/* At this point we need to linearize multiple scatterlist
 	 * elements or a single shared page. Either way we need to
 	 * copy into a linear buffer exclusively owned by BPF. Then
@@ -2400,7 +2399,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 	if (msg->sg_end < 0)
 		msg->sg_end += MAX_SKB_FRAGS;
 out:
-	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data = sg_virt(&sg[first_sg]) + start - offset;
 	msg->data_end = msg->data + bytes;
 
 	return 0;

From 2e43f95dd8ee62bc8bf57f2afac37fbd70c8d565 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Aug 2018 16:50:35 +0200
Subject: [PATCH 7/8] bpf: fix shift upon scatterlist ring wrap-around in
 bpf_msg_pull_data

If first_sg and last_sg wraps around in the scatterlist ring, then we
need to account for that in the shift as well. E.g. crafting such msgs
where this is the case leads to a hang as shift becomes negative. E.g.
consider the following scenario:

  first_sg := 14     |=>    shift := -12     msg->sg_start := 10
  last_sg  :=  3     |                       msg->sg_end   :=  5

round  1:  i := 15, move_from :=   3, sg[15] := sg[  3]
round  2:  i :=  0, move_from := -12, sg[ 0] := sg[-12]
round  3:  i :=  1, move_from := -11, sg[ 1] := sg[-11]
round  4:  i :=  2, move_from := -10, sg[ 2] := sg[-10]
[...]
round 13:  i := 11, move_from :=  -1, sg[ 2] := sg[ -1]
round 14:  i := 12, move_from :=   0, sg[ 2] := sg[  0]
round 15:  i := 13, move_from :=   1, sg[ 2] := sg[  1]
round 16:  i := 14, move_from :=   2, sg[ 2] := sg[  2]
round 17:  i := 15, move_from :=   3, sg[ 2] := sg[  3]
[...]

This means we will loop forever and never hit the msg->sg_end condition
to break out of the loop. When we see that the ring wraps around, then
the shift should be MAX_SKB_FRAGS - first_sg + last_sg - 1. Meaning,
the remainder slots from the tail of the ring and the head until last_sg
combined.

Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index b9225c55926a..43ba5f8c38ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2370,7 +2370,10 @@ BPF_CALL_4(bpf_msg_pull_data,
 	 * had a single entry though we can just replace it and
 	 * be done. Otherwise walk the ring and shift the entries.
 	 */
-	shift = last_sg - first_sg - 1;
+	WARN_ON_ONCE(last_sg == first_sg);
+	shift = last_sg > first_sg ?
+		last_sg - first_sg - 1 :
+		MAX_SKB_FRAGS - first_sg + last_sg - 1;
 	if (!shift)
 		goto out;
 

From a8cf76a9023bc6709b1361d06bb2fae5227b9d68 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Aug 2018 16:50:36 +0200
Subject: [PATCH 8/8] bpf: fix sg shift repair start offset in
 bpf_msg_pull_data

When we perform the sg shift repair for the scatterlist ring, we
currently start out at i = first_sg + 1. However, this is not
correct since the first_sg could point to the sge sitting at slot
MAX_SKB_FRAGS - 1, and a subsequent i = MAX_SKB_FRAGS will access
the scatterlist ring (sg) out of bounds. Add the sk_msg_iter_var()
helper for iterating through the ring, and apply the same rule
for advancing to the next ring element as we do elsewhere. Later
work will use this helper also in other places.

Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 43ba5f8c38ca..2c7801f6737a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2282,6 +2282,13 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+#define sk_msg_iter_var(var)			\
+	do {					\
+		var++;				\
+		if (var == MAX_SKB_FRAGS)	\
+			var = 0;		\
+	} while (0)
+
 BPF_CALL_4(bpf_msg_pull_data,
 	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
 {
@@ -2302,9 +2309,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 		if (start < offset + len)
 			break;
 		offset += len;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
+		sk_msg_iter_var(i);
 	} while (i != msg->sg_end);
 
 	if (unlikely(start >= offset + len))
@@ -2330,9 +2335,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 	 */
 	do {
 		copy += sg[i].length;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
+		sk_msg_iter_var(i);
 		if (bytes_sg_total <= copy)
 			break;
 	} while (i != msg->sg_end);
@@ -2358,9 +2361,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 		sg[i].length = 0;
 		put_page(sg_page(&sg[i]));
 
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
+		sk_msg_iter_var(i);
 	} while (i != last_sg);
 
 	sg[first_sg].length = copy;
@@ -2377,7 +2378,8 @@ BPF_CALL_4(bpf_msg_pull_data,
 	if (!shift)
 		goto out;
 
-	i = first_sg + 1;
+	i = first_sg;
+	sk_msg_iter_var(i);
 	do {
 		int move_from;
 
@@ -2394,9 +2396,7 @@ BPF_CALL_4(bpf_msg_pull_data,
 		sg[move_from].page_link = 0;
 		sg[move_from].offset = 0;
 
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
+		sk_msg_iter_var(i);
 	} while (1);
 	msg->sg_end -= shift;
 	if (msg->sg_end < 0)