From e06fa9c16ce4b740996189fa5610eabcee734e6c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:50 +0200 Subject: [PATCH 1/8] bpf, sockmap: fix potential use after free in bpf_tcp_close bpf_tcp_close() we pop the psock linkage to a map via psock_map_pop(). A parallel update on the sock hash map can happen between psock_map_pop() and lookup_elem_raw() where we override the element under link->hash / link->key. In bpf_tcp_close()'s lookup_elem_raw() we subsequently only test whether an element is present, but we do not test whether the element is infact the element we were looking for. We lock the sock in bpf_tcp_close() during that time, so do we hold the lock in sock_hash_update_elem(). However, the latter locks the sock which is newly updated, not the one we're purging from the hash table. This means that while one CPU is doing the lookup from bpf_tcp_close(), another CPU is doing the map update in parallel, dropped our sock from the hlist and released the psock. Subsequently the first CPU will find the new sock and attempts to drop and release the old sock yet another time. Fix is that we need to check the elements for a match after lookup, similar as we do in the sock map. Note that the hash tab elems are freed via RCU, so access to their link->hash / link->key is fine since we're under RCU read side there. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..01879e4d599a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); From 15c480efab01197c965ce0562a43ffedd852b8f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:51 +0200 Subject: [PATCH 2/8] bpf, sockmap: fix psock refcount leak in bpf_tcp_recvmsg In bpf_tcp_recvmsg() we first took a reference on the psock, however once we find that there are skbs in the normal socket's receive queue we return with processing them through tcp_recvmsg(). Problem is that we leak the taken reference on the psock in that path. Given we don't really do anything with the psock at this point, move the skb_queue_empty() test before we fetch the psock to fix this case. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 01879e4d599a..26d8a3053407 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -912,6 +912,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +924,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { From 3f6e138d41ddff196f452993528cfe75762ede0f Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 27 Aug 2018 21:30:42 +0200 Subject: [PATCH 3/8] bpf: fix build error with clang Building the newly introduced BPF_PROG_TYPE_SK_REUSEPORT leads to a compile time error when building with clang: net/core/filter.o: In function `sk_reuseport_convert_ctx_access': ../net/core/filter.c:7284: undefined reference to `__compiletime_assert_7284' It seems that clang has issues resolving hweight_long at compile time. Since SK_FL_PROTO_MASK is a constant, we can use the interface for known constant arguments which works fine with clang. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Stefan Agner Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index c25eb36f1320..7a2430945c71 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7281,7 +7281,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); From 501ca81760c204ec59b73e4a00bee5971fc0f1b1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 24 Aug 2018 17:37:00 -0700 Subject: [PATCH 4/8] bpf: sockmap, decrement copied count correctly in redirect error case Currently, when a redirect occurs in sockmap and an error occurs in the redirect call we unwind the scatterlist once in the error path of bpf_tcp_sendmsg_do_redirect() and then again in sendmsg(). Then in the error path of sendmsg we decrement the copied count by the send size. However, its possible we partially sent data before the error was generated. This can happen if do_tcp_sendpages() partially sends the scatterlist before encountering a memory pressure error. If this happens we need to decrement the copied value (the value tracking how many bytes were actually sent to TCP stack) by the number of remaining bytes _not_ the entire send size. Otherwise we risk confusing userspace. Also we don't need two calls to free the scatterlist one is good enough. So remove the one in bpf_tcp_sendmsg_do_redirect() and then properly reduce copied by the number of remaining bytes which may in fact be the entire send size if no bytes were sent. To do this use bool to indicate if free_start_sg() should do mem accounting or not. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 45 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 26d8a3053407..ce63e5801746 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +820,7 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock, case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +843,17 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock, lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -1121,7 +1120,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1580,13 +1579,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } From 5b24109b0563d45094c470684c1f8cea1af269f8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Aug 2018 16:15:35 +0200 Subject: [PATCH 5/8] bpf: fix several offset tests in bpf_msg_pull_data While recently going over bpf_msg_pull_data(), I noticed three issues which are fixed in here: 1) When we attempt to find the first scatterlist element (sge) for the start offset, we add len to the offset before we check for start < offset + len, whereas it should come after when we iterate to the next sge to accumulate the offsets. For example, given a start offset of 12 with a sge length of 8 for the first sge in the list would lead us to determine this sge as the first sge thinking it covers first 16 bytes where start is located, whereas start sits in subsequent sges so we would end up pulling in the wrong data. 2) After figuring out the starting sge, we have a short-cut test in !msg->sg_copy[i] && bytes <= len. This checks whether it's not needed to make the page at the sge private where we can just exit by updating msg->data and msg->data_end. However, the length test is not fully correct. bytes <= len checks whether the requested bytes (end - start offsets) fit into the sge's length. The part that is missing is that start must not be sge length aligned. Meaning, the start offset into the sge needs to be accounted as well on top of the requested bytes as otherwise we can access the sge out of bounds. For example the sge could have length of 8, our requested bytes could have length of 8, but at a start offset of 4, so we also would need to pull in 4 bytes of the next sge, when we jump to the out label we do set msg->data to sg_virt(&sg[i]) + start - offset and msg->data_end to msg->data + bytes which would be oob. 3) The subsequent bytes < copy test for finding the last sge has the same issue as in point 2) but also it tests for less than rather than less or equal to. Meaning if the sge length is of 8 and requested bytes of 8 while having the start aligned with the sge, we would unnecessarily go and pull in the next sge as well to make it private. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7a2430945c71..ec4d67c0cf0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2286,10 +2286,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { unsigned int len = 0, offset = 0, copy = 0; + int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; unsigned char *p, *to, *from; - int bytes = end - start; struct page *page; if (unlikely(flags || end <= start)) @@ -2299,9 +2299,9 @@ BPF_CALL_4(bpf_msg_pull_data, i = msg->sg_start; do { len = sg[i].length; - offset += len; if (start < offset + len) break; + offset += len; i++; if (i == MAX_SKB_FRAGS) i = 0; @@ -2310,7 +2310,11 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; - if (!msg->sg_copy[i] && bytes <= len) + /* The start may point into the sg element so we need to also + * account for the headroom. + */ + bytes_sg_total = start - offset + bytes; + if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; first_sg = i; @@ -2330,12 +2334,12 @@ BPF_CALL_4(bpf_msg_pull_data, i++; if (i == MAX_SKB_FRAGS) i = 0; - if (bytes < copy) + if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); last_sg = i; - if (unlikely(copy < end - start)) + if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); From 0e06b227c5221dd51b5569de93f3b9f532be4a32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:34 +0200 Subject: [PATCH 6/8] bpf: fix msg->data/data_end after sg shift repair in bpf_msg_pull_data In the current code, msg->data is set as sg_virt(&sg[i]) + start - offset and msg->data_end relative to it as msg->data + bytes. Using iterator i to point to the updated starting scatterlist element holds true for some cases, however not for all where we'd end up pointing out of bounds. It is /correct/ for these ones: 1) When first finding the starting scatterlist element (sge) where we find that the page is already privately owned by the msg and where the requested bytes and headroom fit into the sge's length. However, it's /incorrect/ for the following ones: 2) After we made the requested area private and updated the newly allocated page into first_sg slot of the scatterlist ring; when we find that no shift repair of the ring is needed where we bail out updating msg->data and msg->data_end. At that point i will point to last_sg, which in this case is the next elem of first_sg in the ring. The sge at that point might as well be invalid (e.g. i == msg->sg_end), which we use for setting the range of sg_virt(&sg[i]). The correct one would have been first_sg. 3) Similar as in 2) but when we find that a shift repair of the ring is needed. In this case we fix up all sges and stop once we've reached the end. In this case i will point to will point to the new msg->sg_end, and the sge at that point will be invalid. Again here the requested range sits in first_sg. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ec4d67c0cf0c..b9225c55926a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2310,6 +2310,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(start >= offset + len)) return -EINVAL; + first_sg = i; /* The start may point into the sg element so we need to also * account for the headroom. */ @@ -2317,8 +2318,6 @@ BPF_CALL_4(bpf_msg_pull_data, if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; - first_sg = i; - /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then @@ -2400,7 +2399,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (msg->sg_end < 0) msg->sg_end += MAX_SKB_FRAGS; out: - msg->data = sg_virt(&sg[i]) + start - offset; + msg->data = sg_virt(&sg[first_sg]) + start - offset; msg->data_end = msg->data + bytes; return 0; From 2e43f95dd8ee62bc8bf57f2afac37fbd70c8d565 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:35 +0200 Subject: [PATCH 7/8] bpf: fix shift upon scatterlist ring wrap-around in bpf_msg_pull_data If first_sg and last_sg wraps around in the scatterlist ring, then we need to account for that in the shift as well. E.g. crafting such msgs where this is the case leads to a hang as shift becomes negative. E.g. consider the following scenario: first_sg := 14 |=> shift := -12 msg->sg_start := 10 last_sg := 3 | msg->sg_end := 5 round 1: i := 15, move_from := 3, sg[15] := sg[ 3] round 2: i := 0, move_from := -12, sg[ 0] := sg[-12] round 3: i := 1, move_from := -11, sg[ 1] := sg[-11] round 4: i := 2, move_from := -10, sg[ 2] := sg[-10] [...] round 13: i := 11, move_from := -1, sg[ 2] := sg[ -1] round 14: i := 12, move_from := 0, sg[ 2] := sg[ 0] round 15: i := 13, move_from := 1, sg[ 2] := sg[ 1] round 16: i := 14, move_from := 2, sg[ 2] := sg[ 2] round 17: i := 15, move_from := 3, sg[ 2] := sg[ 3] [...] This means we will loop forever and never hit the msg->sg_end condition to break out of the loop. When we see that the ring wraps around, then the shift should be MAX_SKB_FRAGS - first_sg + last_sg - 1. Meaning, the remainder slots from the tail of the ring and the head until last_sg combined. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index b9225c55926a..43ba5f8c38ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2370,7 +2370,10 @@ BPF_CALL_4(bpf_msg_pull_data, * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ - shift = last_sg - first_sg - 1; + WARN_ON_ONCE(last_sg == first_sg); + shift = last_sg > first_sg ? + last_sg - first_sg - 1 : + MAX_SKB_FRAGS - first_sg + last_sg - 1; if (!shift) goto out; From a8cf76a9023bc6709b1361d06bb2fae5227b9d68 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Aug 2018 16:50:36 +0200 Subject: [PATCH 8/8] bpf: fix sg shift repair start offset in bpf_msg_pull_data When we perform the sg shift repair for the scatterlist ring, we currently start out at i = first_sg + 1. However, this is not correct since the first_sg could point to the sge sitting at slot MAX_SKB_FRAGS - 1, and a subsequent i = MAX_SKB_FRAGS will access the scatterlist ring (sg) out of bounds. Add the sk_msg_iter_var() helper for iterating through the ring, and apply the same rule for advancing to the next ring element as we do elsewhere. Later work will use this helper also in other places. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 43ba5f8c38ca..2c7801f6737a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2282,6 +2282,13 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +#define sk_msg_iter_var(var) \ + do { \ + var++; \ + if (var == MAX_SKB_FRAGS) \ + var = 0; \ + } while (0) + BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { @@ -2302,9 +2309,7 @@ BPF_CALL_4(bpf_msg_pull_data, if (start < offset + len) break; offset += len; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != msg->sg_end); if (unlikely(start >= offset + len)) @@ -2330,9 +2335,7 @@ BPF_CALL_4(bpf_msg_pull_data, */ do { copy += sg[i].length; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); @@ -2358,9 +2361,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[i].length = 0; put_page(sg_page(&sg[i])); - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (i != last_sg); sg[first_sg].length = copy; @@ -2377,7 +2378,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (!shift) goto out; - i = first_sg + 1; + i = first_sg; + sk_msg_iter_var(i); do { int move_from; @@ -2394,9 +2396,7 @@ BPF_CALL_4(bpf_msg_pull_data, sg[move_from].page_link = 0; sg[move_from].offset = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; + sk_msg_iter_var(i); } while (1); msg->sg_end -= shift; if (msg->sg_end < 0)