From 10154dbded6d6a2fecaebdfda206609de0f121a9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 26 Jun 2024 19:51:26 +0200 Subject: [PATCH 1/2] udp: Allow GSO transmit from devices with no checksum offload Today sending a UDP GSO packet from a TUN device results in an EIO error: import fcntl, os, struct from socket import * TUNSETIFF = 0x400454CA IFF_TUN = 0x0001 IFF_NO_PI = 0x1000 UDP_SEGMENT = 103 tun_fd = os.open("/dev/net/tun", os.O_RDWR) ifr = struct.pack("16sH", b"tun0", IFF_TUN | IFF_NO_PI) fcntl.ioctl(tun_fd, TUNSETIFF, ifr) os.system("ip addr add 192.0.2.1/24 dev tun0") os.system("ip link set dev tun0 up") s = socket(AF_INET, SOCK_DGRAM) s.setsockopt(SOL_UDP, UDP_SEGMENT, 1200) s.sendto(b"x" * 3000, ("192.0.2.2", 9)) # EIO This is due to a check in the udp stack if the egress device offers checksum offload. While TUN/TAP devices, by default, don't advertise this capability because it requires support from the TUN/TAP reader. However, the GSO stack has a software fallback for checksum calculation, which we can use. This way we don't force UDP_SEGMENT users to handle the EIO error and implement a segmentation fallback. Lift the restriction so that UDP_SEGMENT can be used with any egress device. We also need to adjust the UDP GSO code to match the GSO stack expectation about ip_summed field, as set in commit 8d63bee643f1 ("net: avoid skb_warn_bad_offload false positives on UFO"). Otherwise we will hit the bad offload check. Users should, however, expect a potential performance impact when batch-sending packets with UDP_SEGMENT without checksum offload on the egress device. In such case the packet payload is read twice: first during the sendmsg syscall when copying data from user memory, and then in the GSO stack for checksum computation. This double memory read can be less efficient than a regular sendmsg where the checksum is calculated during the initial data copy from user memory. Signed-off-by: Jakub Sitnicki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240626-linux-udpgso-v2-1-422dfcbd6b48@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 3 +-- net/ipv4/udp_offload.c | 8 ++++++++ net/ipv6/udp.c | 3 +-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d08bf16d476d..ed97df6af14d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -938,8 +938,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 59448a2dbf2c..aa2e0a28ca61 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -357,6 +357,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, else uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0; + /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same + * meaning. However, check for bad offloads in the GSO stack expects the + * latter, if the checksum was calculated in software. To vouch for the + * segment skbs we actually need to set it on the gso_skb. + */ + if (gso_skb->ip_summed == CHECKSUM_NONE) + gso_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* update refcount for the packet */ if (copy_dtor) { int delta = sum_truesize - gso_skb->truesize; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b56f0b9f4307..b5456394cc67 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1257,8 +1257,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } From 3e400219c04dcfcada224303c8efa2c91beaae0b Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 26 Jun 2024 19:51:27 +0200 Subject: [PATCH 2/2] selftests/net: Add test coverage for UDP GSO software fallback Extend the existing test to exercise UDP GSO egress through devices with various offload capabilities, including lack of checksum offload, which is the default case for TUN/TAP devices. Test against a dummy device because it is simpler to set up then TUN/TAP. Signed-off-by: Jakub Sitnicki Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20240626-linux-udpgso-v2-2-422dfcbd6b48@cloudflare.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso.c | 15 ++++++++-- tools/testing/selftests/net/udpgso.sh | 43 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 85b3baa3f7f3..3e74cfa1a2bf 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -53,6 +53,7 @@ static bool cfg_do_ipv6; static bool cfg_do_connected; static bool cfg_do_connectionless; static bool cfg_do_msgmore; +static bool cfg_do_recv = true; static bool cfg_do_setsockopt; static int cfg_specific_test_id = -1; @@ -414,6 +415,9 @@ static void run_one(struct testcase *test, int fdt, int fdr, if (!sent) return; + if (!cfg_do_recv) + return; + if (test->gso_len) mss = test->gso_len; else @@ -464,8 +468,10 @@ static void run_test(struct sockaddr *addr, socklen_t alen) if (fdr == -1) error(1, errno, "socket r"); - if (bind(fdr, addr, alen)) - error(1, errno, "bind"); + if (cfg_do_recv) { + if (bind(fdr, addr, alen)) + error(1, errno, "bind"); + } /* Have tests fail quickly instead of hang */ if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) @@ -524,7 +530,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "46cCmst:")) != -1) { + while ((c = getopt(argc, argv, "46cCmRst:")) != -1) { switch (c) { case '4': cfg_do_ipv4 = true; @@ -541,6 +547,9 @@ static void parse_opts(int argc, char **argv) case 'm': cfg_do_msgmore = true; break; + case 'R': + cfg_do_recv = false; + break; case 's': cfg_do_setsockopt = true; break; diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh index 6c63178086b0..85d1fa3c1ff7 100755 --- a/tools/testing/selftests/net/udpgso.sh +++ b/tools/testing/selftests/net/udpgso.sh @@ -27,6 +27,31 @@ test_route_mtu() { ip route add local fd00::1/128 table local dev lo mtu 1500 } +setup_dummy_sink() { + ip link add name sink mtu 1500 type dummy + ip addr add dev sink 10.0.0.0/24 + ip addr add dev sink fd00::2/64 nodad + ip link set dev sink up +} + +test_hw_gso_hw_csum() { + setup_dummy_sink + ethtool -K sink tx-checksum-ip-generic on >/dev/null + ethtool -K sink tx-udp-segmentation on >/dev/null +} + +test_sw_gso_hw_csum() { + setup_dummy_sink + ethtool -K sink tx-checksum-ip-generic on >/dev/null + ethtool -K sink tx-udp-segmentation off >/dev/null +} + +test_sw_gso_sw_csum() { + setup_dummy_sink + ethtool -K sink tx-checksum-ip-generic off >/dev/null + ethtool -K sink tx-udp-segmentation off >/dev/null +} + if [ "$#" -gt 0 ]; then "$1" shift 2 # pop "test_*" arg and "--" delimiter @@ -56,3 +81,21 @@ echo "ipv4 msg_more" echo "ipv6 msg_more" ./in_netns.sh "$0" test_dev_mtu -- ./udpgso -6 -C -m + +echo "ipv4 hw-gso hw-csum" +./in_netns.sh "$0" test_hw_gso_hw_csum -- ./udpgso -4 -C -R + +echo "ipv6 hw-gso hw-csum" +./in_netns.sh "$0" test_hw_gso_hw_csum -- ./udpgso -6 -C -R + +echo "ipv4 sw-gso hw-csum" +./in_netns.sh "$0" test_sw_gso_hw_csum -- ./udpgso -4 -C -R + +echo "ipv6 sw-gso hw-csum" +./in_netns.sh "$0" test_sw_gso_hw_csum -- ./udpgso -6 -C -R + +echo "ipv4 sw-gso sw-csum" +./in_netns.sh "$0" test_sw_gso_sw_csum -- ./udpgso -4 -C -R + +echo "ipv6 sw-gso sw-csum" +./in_netns.sh "$0" test_sw_gso_sw_csum -- ./udpgso -6 -C -R