Merge branch 'selftests-drv-net-driver-tests-for-hw-gro'

Jakub Kicinski says: ==================== selftests: drv-net: driver tests for HW GRO Add tests for HW GRO stats, packet ordering and depth. The ynltool and bnxt patches from v2 were applied separately. ==================== Link: https://patch.msgid.link/20260318033819.1469350-1-kuba@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2026-05-16 11:21:26 -04:00 · 2026-03-19 16:57:30 -07:00
parent edab1ca5ec 7dae8ffb09
commit 031f090084
9 changed files with 728 additions and 30 deletions
--- a/tools/testing/selftests/drivers/net/.gitignore
+++ b/tools/testing/selftests/drivers/net/.gitignore
@@ -1,4 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
-gro
 napi_id_helper
 psp_responder
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -6,7 +6,6 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \
 		 ../../net/lib.sh \

 TEST_GEN_FILES := \
-	gro \
 	napi_id_helper \
 # end of TEST_GEN_FILES

--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -35,11 +35,18 @@ Test cases:
  - large_rem: Large packet remainder handling
 """

+import glob
 import os
+import re
 from lib.py import ksft_run, ksft_exit, ksft_pr
 from lib.py import NetDrvEpEnv, KsftXfailEx
+from lib.py import NetdevFamily, EthtoolFamily
 from lib.py import bkg, cmd, defer, ethtool, ip
-from lib.py import ksft_variants
+from lib.py import ksft_variants, KsftNamedVariant
+
+
+# gro.c uses hardcoded DPORT=8000
+GRO_DPORT = 8000


 def _resolve_dmac(cfg, ipver):
@@ -113,11 +120,103 @@ def _set_ethtool_feat(dev, current, feats, host=None):
        ksft_pr(eth_cmd)


+def _get_queue_stats(cfg, queue_id):
+    """Get stats for a specific Rx queue."""
+    cfg.wait_hw_stats_settle()
+    data = cfg.netnl.qstats_get({"ifindex": cfg.ifindex, "scope": ["queue"]},
+                                dump=True)
+    for q in data:
+        if q.get('queue-type') == 'rx' and q.get('queue-id') == queue_id:
+            return q
+    return {}
+
+
+def _setup_isolated_queue(cfg):
+    """Set up an isolated queue for testing using ntuple filter.
+
+    Remove queue 1 from the default RSS context and steer test traffic to it.
+    """
+    test_queue = 1
+
+    qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*"))
+    if qcnt < 2:
+        raise KsftXfailEx(f"Need at least 2 queues, have {qcnt}")
+
+    # Remove queue 1 from default RSS context by setting its weight to 0
+    weights = ["1"] * qcnt
+    weights[test_queue] = "0"
+    ethtool(f"-X {cfg.ifname} weight " + " ".join(weights))
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    # Set up ntuple filter to steer our test traffic to the isolated queue
+    flow  = f"flow-type tcp{cfg.addr_ipver} "
+    flow += f"dst-ip {cfg.addr} dst-port {GRO_DPORT} action {test_queue}"
+    output = ethtool(f"-N {cfg.ifname} {flow}").stdout
+    ntuple_id = int(output.split()[-1])
+    defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
+
+    return test_queue
+
+
+def _setup_queue_count(cfg, num_queues):
+    """Configure the NIC to use a specific number of queues."""
+    channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    ch_max = channels.get('combined-max', 0)
+    qcnt = channels['combined-count']
+
+    if ch_max < num_queues:
+        raise KsftXfailEx(f"Need at least {num_queues} queues, max={ch_max}")
+
+    defer(ethtool, f"-L {cfg.ifname} combined {qcnt}")
+    ethtool(f"-L {cfg.ifname} combined {num_queues}")
+
+
+def _run_gro_bin(cfg, test_name, protocol=None, num_flows=None,
+                 order_check=False, verbose=False, fail=False):
+    """Run gro binary with given test and return the process result."""
+    if not hasattr(cfg, "bin_remote"):
+        cfg.bin_local = cfg.net_lib_dir / "gro"
+        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+
+    if protocol is None:
+        ipver = cfg.addr_ipver
+        protocol = f"ipv{ipver}"
+    else:
+        ipver = "6" if protocol[-1] == "6" else "4"
+
+    dmac = _resolve_dmac(cfg, ipver)
+
+    base_args = [
+        f"--{protocol}",
+        f"--dmac {dmac}",
+        f"--smac {cfg.remote_dev['address']}",
+        f"--daddr {cfg.addr_v[ipver]}",
+        f"--saddr {cfg.remote_addr_v[ipver]}",
+        f"--test {test_name}",
+    ]
+    if num_flows:
+        base_args.append(f"--num-flows {num_flows}")
+    if order_check:
+        base_args.append("--order-check")
+    if verbose:
+        base_args.append("--verbose")
+
+    args = " ".join(base_args)
+
+    rx_cmd = f"{cfg.bin_local} {args} --rx --iface {cfg.ifname}"
+    tx_cmd = f"{cfg.bin_remote} {args} --iface {cfg.remote_ifname}"
+
+    with bkg(rx_cmd, ksft_ready=True, exit_wait=True, fail=fail) as rx_proc:
+        cmd(tx_cmd, host=cfg.remote)
+
+    return rx_proc
+
+
 def _setup(cfg, mode, test_name):
    """ Setup hardware loopback mode for GRO testing. """

    if not hasattr(cfg, "bin_remote"):
-        cfg.bin_local = cfg.test_dir / "gro"
+        cfg.bin_local = cfg.net_lib_dir / "gro"
        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)

    if not hasattr(cfg, "feat"):
@@ -233,30 +332,14 @@ def test(cfg, mode, protocol, test_name):

    _setup(cfg, mode, test_name)

-    base_cmd_args = [
-        f"--{protocol}",
-        f"--dmac {_resolve_dmac(cfg, ipver)}",
-        f"--smac {cfg.remote_dev['address']}",
-        f"--daddr {cfg.addr_v[ipver]}",
-        f"--saddr {cfg.remote_addr_v[ipver]}",
-        f"--test {test_name}",
-        "--verbose"
-    ]
-    base_args = " ".join(base_cmd_args)
-
    # Each test is run 6 times to deflake, because given the receive timing,
    # not all packets that should coalesce will be considered in the same flow
    # on every try.
    max_retries = 6
    for attempt in range(max_retries):
-        rx_cmd = f"{cfg.bin_local} {base_args} --rx --iface {cfg.ifname}"
-        tx_cmd = f"{cfg.bin_remote} {base_args} --iface {cfg.remote_ifname}"
-
        fail_now = attempt >= max_retries - 1
-
-        with bkg(rx_cmd, ksft_ready=True, exit_wait=True,
-                 fail=fail_now) as rx_proc:
-            cmd(tx_cmd, host=cfg.remote)
+        rx_proc = _run_gro_bin(cfg, test_name, protocol=protocol,
+                               verbose=True, fail=fail_now)

        if rx_proc.ret == 0:
            return
@@ -270,11 +353,89 @@ def test(cfg, mode, protocol, test_name):
        ksft_pr(f"Attempt {attempt + 1}/{max_retries} failed, retrying...")


+def _capacity_variants():
+    """Generate variants for capacity test: mode x queue setup."""
+    setups = [
+        ("isolated", _setup_isolated_queue),
+        ("1q", lambda cfg: _setup_queue_count(cfg, 1)),
+        ("8q", lambda cfg: _setup_queue_count(cfg, 8)),
+    ]
+    for mode in ["sw", "hw", "lro"]:
+        for name, func in setups:
+            yield KsftNamedVariant(f"{mode}_{name}", mode, func)
+
+
+@ksft_variants(_capacity_variants())
+def test_gro_capacity(cfg, mode, setup_func):
+    """
+    Probe GRO capacity.
+
+    Start with 8 flows and increase by 2x on each successful run.
+    Retry up to 3 times on failure.
+
+    Variants combine mode (sw, hw, lro) with queue setup:
+      - isolated: Use a single queue isolated from RSS
+      - 1q: Configure NIC to use 1 queue
+      - 8q: Configure NIC to use 8 queues
+    """
+    max_retries = 3
+
+    _setup(cfg, mode, "capacity")
+    queue_id = setup_func(cfg)
+
+    num_flows = 8
+    while True:
+        success = False
+        for attempt in range(max_retries):
+            if queue_id is not None:
+                stats_before = _get_queue_stats(cfg, queue_id)
+
+            rx_proc = _run_gro_bin(cfg, "capacity", num_flows=num_flows)
+            output = rx_proc.stdout
+
+            if queue_id is not None:
+                stats_after = _get_queue_stats(cfg, queue_id)
+                qstat_pkts = (stats_after.get('rx-packets', 0) -
+                              stats_before.get('rx-packets', 0))
+                gro_pkts = (stats_after.get('rx-hw-gro-packets', 0) -
+                            stats_before.get('rx-hw-gro-packets', 0))
+                qstat_str = f" qstat={qstat_pkts} hw-gro={gro_pkts}"
+            else:
+                qstat_str = ""
+
+            # Parse and print STATS line
+            match = re.search(
+                r'STATS: received=(\d+) wire=(\d+) coalesced=(\d+)', output)
+            if match:
+                received = int(match.group(1))
+                wire = int(match.group(2))
+                coalesced = int(match.group(3))
+                status = "PASS" if received == num_flows else "MISS"
+                ksft_pr(f"flows={num_flows} attempt={attempt + 1} "
+                        f"received={received} wire={wire} "
+                        f"coalesced={coalesced}{qstat_str} [{status}]")
+                if received == num_flows:
+                    success = True
+                    break
+            else:
+                ksft_pr(rx_proc)
+                ksft_pr(f"flows={num_flows} attempt={attempt + 1}"
+                        f"{qstat_str} [FAIL - can't parse stats]")
+
+        if not success:
+            ksft_pr(f"Stopped at {num_flows} flows")
+            break
+
+        num_flows *= 2
+
+
 def main() -> None:
    """ Ksft boiler plate main """

    with NetDrvEpEnv(__file__) as cfg:
-        ksft_run(cases=[test], args=(cfg,))
+        cfg.ethnl = EthtoolFamily()
+        cfg.netnl = NetdevFamily()
+        ksft_run(cases=[test, test_gro_capacity], args=(cfg,))
    ksft_exit()


--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -26,6 +26,7 @@ TEST_PROGS = \
 	ethtool_extended_state.sh \
 	ethtool_mm.sh \
 	ethtool_rmon.sh \
+	gro_hw.py \
 	hw_stats_l3.sh \
 	hw_stats_l3_gre.sh \
 	iou-zcrx.py \
--- a/tools/testing/selftests/drivers/net/hw/gro_hw.py
+++ b/tools/testing/selftests/drivers/net/hw/gro_hw.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+HW GRO tests focusing on device machinery like stats, rather than protocol
+processing.
+"""
+
+import glob
+import re
+
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import ksft_eq, ksft_ge, ksft_variants
+from lib.py import NetDrvEpEnv, NetdevFamily
+from lib.py import KsftSkipEx
+from lib.py import bkg, cmd, defer, ethtool, ip
+
+
+# gro.c uses hardcoded DPORT=8000
+GRO_DPORT = 8000
+
+
+def _get_queue_stats(cfg, queue_id):
+    """Get stats for a specific Rx queue."""
+    cfg.wait_hw_stats_settle()
+    data = cfg.netnl.qstats_get({"ifindex": cfg.ifindex, "scope": ["queue"]},
+                                dump=True)
+    for q in data:
+        if q.get('queue-type') == 'rx' and q.get('queue-id') == queue_id:
+            return q
+    return {}
+
+
+def _resolve_dmac(cfg, ipver):
+    """Find the destination MAC address for sending packets."""
+    attr = "dmac" + ipver
+    if hasattr(cfg, attr):
+        return getattr(cfg, attr)
+
+    route = ip(f"-{ipver} route get {cfg.addr_v[ipver]}",
+               json=True, host=cfg.remote)[0]
+    gw = route.get("gateway")
+    if not gw:
+        setattr(cfg, attr, cfg.dev['address'])
+        return getattr(cfg, attr)
+
+    cmd(f"ping -c1 -W0 -I{cfg.remote_ifname} {gw}", host=cfg.remote)
+    neigh = ip(f"neigh get {gw} dev {cfg.remote_ifname}",
+               json=True, host=cfg.remote)[0]
+    setattr(cfg, attr, neigh['lladdr'])
+    return getattr(cfg, attr)
+
+
+def _setup_isolated_queue(cfg):
+    """Set up an isolated queue for testing using ntuple filter.
+
+    Remove queue 1 from the default RSS context and steer test traffic to it.
+    """
+    test_queue = 1
+
+    qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*"))
+    if qcnt < 2:
+        raise KsftSkipEx(f"Need at least 2 queues, have {qcnt}")
+
+    # Remove queue 1 from default RSS context by setting its weight to 0
+    weights = ["1"] * qcnt
+    weights[test_queue] = "0"
+    ethtool(f"-X {cfg.ifname} weight " + " ".join(weights))
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    # Set up ntuple filter to steer our test traffic to the isolated queue
+    flow  = f"flow-type tcp{cfg.addr_ipver} "
+    flow += f"dst-ip {cfg.addr} dst-port {GRO_DPORT} action {test_queue}"
+    output = ethtool(f"-N {cfg.ifname} {flow}").stdout
+    ntuple_id = int(output.split()[-1])
+    defer(ethtool, f"-N {cfg.ifname} delete {ntuple_id}")
+
+    return test_queue
+
+
+def _run_gro_test(cfg, test_name, num_flows=None, ignore_fail=False,
+                  order_check=False):
+    """Run gro binary with given test and return output."""
+    if not hasattr(cfg, "bin_remote"):
+        cfg.bin_local = cfg.net_lib_dir / "gro"
+        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+
+    ipver = cfg.addr_ipver
+    protocol = f"--ipv{ipver}"
+    dmac = _resolve_dmac(cfg, ipver)
+
+    base_args = [
+        protocol,
+        f"--dmac {dmac}",
+        f"--smac {cfg.remote_dev['address']}",
+        f"--daddr {cfg.addr}",
+        f"--saddr {cfg.remote_addr_v[ipver]}",
+        f"--test {test_name}",
+    ]
+    if num_flows:
+        base_args.append(f"--num-flows {num_flows}")
+    if order_check:
+        base_args.append("--order-check")
+
+    args = " ".join(base_args)
+
+    rx_cmd = f"{cfg.bin_local} {args} --rx --iface {cfg.ifname}"
+    tx_cmd = f"{cfg.bin_remote} {args} --iface {cfg.remote_ifname}"
+
+    with bkg(rx_cmd, ksft_ready=True, exit_wait=True, fail=False) as rx_proc:
+        cmd(tx_cmd, host=cfg.remote)
+
+    if not ignore_fail:
+        ksft_eq(rx_proc.ret, 0)
+        if rx_proc.ret != 0:
+            ksft_pr(rx_proc)
+
+    return rx_proc.stdout
+
+
+def _require_hw_gro_stats(cfg, queue_id):
+    """Check if device reports HW GRO stats for the queue."""
+    stats = _get_queue_stats(cfg, queue_id)
+    required = ['rx-packets', 'rx-hw-gro-packets', 'rx-hw-gro-wire-packets']
+    for stat in required:
+        if stat not in stats:
+            raise KsftSkipEx(f"Driver does not report '{stat}' via qstats")
+
+
+def _set_ethtool_feat(cfg, current, feats):
+    """Set ethtool features with defer to restore original state."""
+    s2n = {True: "on", False: "off"}
+
+    new = ["-K", cfg.ifname]
+    old = ["-K", cfg.ifname]
+    no_change = True
+    for name, state in feats.items():
+        new += [name, s2n[state]]
+        old += [name, s2n[current[name]["active"]]]
+
+        if current[name]["active"] != state:
+            no_change = False
+            if current[name]["fixed"]:
+                raise KsftSkipEx(f"Device does not support {name}")
+    if no_change:
+        return
+
+    eth_cmd = ethtool(" ".join(new))
+    defer(ethtool, " ".join(old))
+
+    # If ethtool printed something kernel must have modified some features
+    if eth_cmd.stdout:
+        ksft_pr(eth_cmd)
+
+
+def _setup_hw_gro(cfg):
+    """Enable HW GRO on the device, disabling SW GRO."""
+    feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+
+    # Try to disable SW GRO and enable HW GRO
+    _set_ethtool_feat(cfg, feat,
+                      {"generic-receive-offload": False,
+                       "rx-gro-hw": True,
+                       "large-receive-offload": False})
+
+    # Some NICs treat HW GRO as a GRO sub-feature so disabling GRO
+    # will also clear HW GRO. Use a hack of installing XDP generic
+    # to skip SW GRO, even when enabled.
+    feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+    if not feat["rx-gro-hw"]["active"]:
+        ksft_pr("Driver clears HW GRO when SW GRO is cleared, using generic XDP workaround")
+        prog = cfg.net_lib_dir / "xdp_dummy.bpf.o"
+        ip(f"link set dev {cfg.ifname} xdpgeneric obj {prog} sec xdp")
+        defer(ip, f"link set dev {cfg.ifname} xdpgeneric off")
+
+        # Attaching XDP may change features, fetch the latest state
+        feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+
+        _set_ethtool_feat(cfg, feat,
+                          {"generic-receive-offload": True,
+                           "rx-gro-hw": True,
+                           "large-receive-offload": False})
+
+
+def _check_gro_stats(cfg, test_queue, stats_before,
+                     expect_rx, expect_gro, expect_wire):
+    """Validate GRO stats against expected values."""
+    stats_after = _get_queue_stats(cfg, test_queue)
+
+    rx_delta = (stats_after.get('rx-packets', 0) -
+                stats_before.get('rx-packets', 0))
+    gro_delta = (stats_after.get('rx-hw-gro-packets', 0) -
+                 stats_before.get('rx-hw-gro-packets', 0))
+    wire_delta = (stats_after.get('rx-hw-gro-wire-packets', 0) -
+                  stats_before.get('rx-hw-gro-wire-packets', 0))
+
+    ksft_eq(rx_delta, expect_rx, comment="rx-packets")
+    ksft_eq(gro_delta, expect_gro, comment="rx-hw-gro-packets")
+    ksft_eq(wire_delta, expect_wire, comment="rx-hw-gro-wire-packets")
+
+
+def test_gro_stats_single(cfg):
+    """
+    Test that a single packet doesn't affect GRO stats.
+
+    Send a single packet that cannot be coalesced (nothing to coalesce with).
+    GRO stats should not increase since no coalescing occurred.
+    rx-packets should increase by 2 (1 data + 1 FIN).
+    """
+    _setup_hw_gro(cfg)
+
+    test_queue = _setup_isolated_queue(cfg)
+    _require_hw_gro_stats(cfg, test_queue)
+
+    stats_before = _get_queue_stats(cfg, test_queue)
+
+    _run_gro_test(cfg, "single")
+
+    # 1 data + 1 FIN = 2 rx-packets, no coalescing
+    _check_gro_stats(cfg, test_queue, stats_before,
+                     expect_rx=2, expect_gro=0, expect_wire=0)
+
+
+def test_gro_stats_full(cfg):
+    """
+    Test GRO stats when overwhelming HW GRO capacity.
+
+    Send 500 flows to exceed HW GRO flow capacity on a single queue.
+    This should result in some packets not being coalesced.
+    Validate that qstats match what gro.c observed.
+    """
+    _setup_hw_gro(cfg)
+
+    test_queue = _setup_isolated_queue(cfg)
+    _require_hw_gro_stats(cfg, test_queue)
+
+    num_flows = 500
+    stats_before = _get_queue_stats(cfg, test_queue)
+
+    # Run capacity test - will likely fail because not all packets coalesce
+    output = _run_gro_test(cfg, "capacity", num_flows=num_flows,
+                           ignore_fail=True)
+
+    # Parse gro.c output: "STATS: received=X wire=Y coalesced=Z"
+    match = re.search(r'STATS: received=(\d+) wire=(\d+) coalesced=(\d+)',
+                      output)
+    if not match:
+        raise KsftSkipEx(f"Could not parse gro.c output: {output}")
+
+    rx_frames = int(match.group(2))
+    gro_coalesced = int(match.group(3))
+
+    ksft_ge(gro_coalesced, 1,
+            comment="At least some packets should coalesce")
+
+    # received + 1 FIN, coalesced super-packets, coalesced * 2 wire packets
+    _check_gro_stats(cfg, test_queue, stats_before,
+                     expect_rx=rx_frames + 1,
+                     expect_gro=gro_coalesced,
+                     expect_wire=gro_coalesced * 2)
+
+
+@ksft_variants([4, 32, 512])
+def test_gro_order(cfg, num_flows):
+    """
+    Test that HW GRO preserves packet ordering between flows.
+
+    Packets may get delayed until the aggregate is released,
+    but reordering between aggregates and packet terminating
+    the aggregate and normal packets should not happen.
+
+    Note that this test is stricter than truly required.
+    Reordering packets between flows should not cause issues.
+    This test will also fail if traffic is run over an ECMP fabric.
+    """
+    _setup_hw_gro(cfg)
+    _setup_isolated_queue(cfg)
+
+    _run_gro_test(cfg, "capacity", num_flows=num_flows, order_check=True)
+
+
+def main() -> None:
+    """ Ksft boiler plate main """
+
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+        cfg.netnl = NetdevFamily()
+        ksft_run([test_gro_stats_single,
+                  test_gro_stats_full,
+                  test_gro_order], args=(cfg,))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -288,8 +288,8 @@ class NetDrvEpEnv(NetDrvEnvBase):
                if "Operation not supported" not in e.cmd.stderr:
                    raise

-            self._stats_settle_time = 0.025 + \
-                data.get('stats-block-usecs', 0) / 1000 / 1000
+            self._stats_settle_time = \
+                1.25 * data.get('stats-block-usecs', 20000) / 1000 / 1000

        time.sleep(self._stats_settle_time)

--- a/tools/testing/selftests/net/lib/.gitignore
+++ b/tools/testing/selftests/net/lib/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 csum
+gro
 xdp_helper
--- a/tools/testing/selftests/net/lib/Makefile
+++ b/tools/testing/selftests/net/lib/Makefile
@@ -14,6 +14,7 @@ TEST_FILES := \
 TEST_GEN_FILES := \
 	$(patsubst %.c,%.o,$(wildcard *.bpf.c)) \
 	csum \
+	gro \
 	xdp_helper \
 # end of TEST_GEN_FILES

--- a/tools/testing/selftests/drivers/net/gro.c
+++ b/tools/testing/selftests/drivers/net/gro.c
@@ -43,6 +43,10 @@
 *   - large_max: exceeding max size
 *   - large_rem: remainder handling
 *
+ * single, capacity:
+ *  Boring cases used to test coalescing machinery itself and stats
+ *  more than protocol behavior.
+ *
 * MSS is defined as 4096 - header because if it is too small
 * (i.e. 1500 MTU - header), it will result in many packets,
 * increasing the "large" test case's flakiness. This is because
@@ -63,6 +67,7 @@
 #include <linux/filter.h>
 #include <linux/if_packet.h>
 #include <linux/ipv6.h>
+#include <linux/net_tstamp.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <netinet/in.h>
@@ -74,10 +79,11 @@
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
+#include <time.h>
 #include <unistd.h>

 #include "kselftest.h"
-#include "../../net/lib/ksft.h"
+#include "ksft.h"

 #define DPORT 8000
 #define SPORT 1500
@@ -123,6 +129,13 @@ static int tcp_offset = -1;
 static int total_hdr_len = -1;
 static int ethhdr_proto = -1;
 static bool ipip;
+static uint64_t txtime_ns;
+static int num_flows = 4;
+static bool order_check;
+
+#define CAPACITY_PAYLOAD_LEN 200
+
+#define TXTIME_DELAY_MS 5

 static void vlog(const char *fmt, ...)
 {
@@ -330,13 +343,37 @@ static void fill_transportlayer(void *buf, int seq_offset, int ack_offset,

 static void write_packet(int fd, char *buf, int len, struct sockaddr_ll *daddr)
 {
+	char control[CMSG_SPACE(sizeof(uint64_t))];
+	struct msghdr msg = {};
+	struct iovec iov = {};
+	struct cmsghdr *cm;
 	int ret = -1;

-	ret = sendto(fd, buf, len, 0, (struct sockaddr *)daddr, sizeof(*daddr));
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_name = daddr;
+	msg.msg_namelen = sizeof(*daddr);
+
+	if (txtime_ns) {
+		memset(control, 0, sizeof(control));
+		msg.msg_control = control;
+		msg.msg_controllen = sizeof(control);
+
+		cm = CMSG_FIRSTHDR(&msg);
+		cm->cmsg_level = SOL_SOCKET;
+		cm->cmsg_type = SCM_TXTIME;
+		cm->cmsg_len = CMSG_LEN(sizeof(uint64_t));
+		memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));
+	}
+
+	ret = sendmsg(fd, &msg, 0);
 	if (ret == -1)
-		error(1, errno, "sendto failure");
+		error(1, errno, "sendmsg failure");
 	if (ret != len)
-		error(1, errno, "sendto wrong length");
+		error(1, 0, "sendmsg wrong length: %d vs %d", ret, len);
 }

 static void create_packet(void *buf, int seq_offset, int ack_offset,
@@ -360,6 +397,45 @@ static void create_packet(void *buf, int seq_offset, int ack_offset,
 	fill_datalinklayer(buf);
 }

+static void create_capacity_packet(void *buf, int flow_id, int pkt_idx, int psh)
+{
+	int seq_offset = pkt_idx * CAPACITY_PAYLOAD_LEN;
+	struct tcphdr *tcph;
+
+	create_packet(buf, seq_offset, 0, CAPACITY_PAYLOAD_LEN, 0);
+
+	/* Customize for this flow id */
+	memset(buf + total_hdr_len, 'a' + flow_id, CAPACITY_PAYLOAD_LEN);
+
+	tcph = buf + tcp_offset;
+	tcph->source = htons(SPORT + flow_id);
+	tcph->psh = psh;
+	tcph->check = 0;
+	tcph->check = tcp_checksum(tcph, CAPACITY_PAYLOAD_LEN);
+}
+
+/* Send a capacity test, 2 packets per flow, all first packets then all second:
+ *  A1 B1 C1 D1 ... A2 B2 C2 D2 ...
+ */
+static void send_capacity(int fd, struct sockaddr_ll *daddr)
+{
+	static char buf[MAX_HDR_LEN + CAPACITY_PAYLOAD_LEN];
+	int pkt_size = total_hdr_len + CAPACITY_PAYLOAD_LEN;
+	int i;
+
+	/* Send first packet of each flow (no PSH) */
+	for (i = 0; i < num_flows; i++) {
+		create_capacity_packet(buf, i, 0, 0);
+		write_packet(fd, buf, pkt_size, daddr);
+	}
+
+	/* Send second packet of each flow (with PSH to flush) */
+	for (i = 0; i < num_flows; i++) {
+		create_capacity_packet(buf, i, 1, 1);
+		write_packet(fd, buf, pkt_size, daddr);
+	}
+}
+
 #ifndef TH_CWR
 #define TH_CWR 0x80
 #endif
@@ -1056,8 +1132,123 @@ static void check_recv_pkts(int fd, int *correct_payload,
 	printf("Test succeeded\n\n");
 }

+static void check_capacity_pkts(int fd)
+{
+	static char buffer[IP_MAXPACKET + ETH_HLEN + 1];
+	struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN);
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN);
+	int num_pkt = 0, num_coal = 0, pkt_idx;
+	const char *fail_reason = NULL;
+	int flow_order[num_flows * 2];
+	int coalesced[num_flows];
+	struct tcphdr *tcph;
+	int ip_ext_len = 0;
+	int total_data = 0;
+	int pkt_size = -1;
+	int data_len = 0;
+	int flow_id;
+	int sport;
+
+	memset(coalesced, 0, sizeof(coalesced));
+	memset(flow_order, -1, sizeof(flow_order));
+
+	while (total_data < num_flows * CAPACITY_PAYLOAD_LEN * 2) {
+		ip_ext_len = 0;
+		pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
+		if (pkt_size < 0)
+			recv_error(fd, errno);
+
+		if (iph->version == 4)
+			ip_ext_len = (iph->ihl - 5) * 4;
+		else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
+			ip_ext_len = MIN_EXTHDR_SIZE;
+
+		tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len);
+
+		/* FIN packet terminates reception */
+		if (tcph->fin)
+			break;
+
+		sport = ntohs(tcph->source);
+		flow_id = sport - SPORT;
+
+		if (flow_id < 0 || flow_id >= num_flows) {
+			vlog("Invalid flow_id %d from sport %d\n",
+			     flow_id, sport);
+			fail_reason = fail_reason ?: "invalid packet";
+			continue;
+		}
+
+		/* Calculate payload length */
+		if (pkt_size == ETH_ZLEN && iph->version == 4) {
+			data_len = ntohs(iph->tot_len)
+				- sizeof(struct tcphdr) - sizeof(struct iphdr);
+		} else {
+			data_len = pkt_size - total_hdr_len - ip_ext_len;
+		}
+
+		flow_order[num_pkt] = flow_id;
+		coalesced[flow_id] = data_len;
+
+		if (data_len == CAPACITY_PAYLOAD_LEN * 2) {
+			num_coal++;
+		} else {
+			vlog("Pkt %d: flow %d, sport %d, len %d (expected %d)\n",
+			     num_pkt, flow_id, sport, data_len,
+			     CAPACITY_PAYLOAD_LEN * 2);
+			fail_reason = fail_reason ?: "not coalesced";
+		}
+
+		num_pkt++;
+		total_data += data_len;
+	}
+
+	/* Check flow ordering. We expect to see all non-coalesced first segs
+	 * then interleaved coalesced and non-coalesced second frames.
+	 */
+	pkt_idx = 0;
+	for (flow_id = 0; order_check && flow_id < num_flows; flow_id++) {
+		bool coaled = coalesced[flow_id] > CAPACITY_PAYLOAD_LEN;
+
+		if (coaled)
+			continue;
+
+		if (flow_order[pkt_idx] != flow_id) {
+			vlog("Flow order mismatch (non-coalesced) at position %d: expected flow %d, got flow %d\n",
+			     pkt_idx, flow_id, flow_order[pkt_idx]);
+			fail_reason = fail_reason ?: "bad packet order (1)";
+		}
+		pkt_idx++;
+	}
+	for (flow_id = 0; order_check && flow_id < num_flows; flow_id++) {
+		bool coaled = coalesced[flow_id] > CAPACITY_PAYLOAD_LEN;
+
+		if (flow_order[pkt_idx] != flow_id) {
+			vlog("Flow order mismatch at position %d: expected flow %d, got flow %d, coalesced: %d\n",
+			     pkt_idx, flow_id, flow_order[pkt_idx], coaled);
+			fail_reason = fail_reason ?: "bad packet order (2)";
+		}
+		pkt_idx++;
+	}
+
+	if (!fail_reason) {
+		vlog("All %d flows coalesced correctly\n", num_flows);
+		printf("Test succeeded\n\n");
+	} else {
+		printf("FAILED\n");
+	}
+
+	/* Always print stats for external validation */
+	printf("STATS: received=%d wire=%d coalesced=%d\n",
+	       num_pkt, num_pkt + num_coal, num_coal);
+
+	if (fail_reason)
+		error(1, 0, "capacity test failed %s", fail_reason);
+}
+
 static void gro_sender(void)
 {
+	int bufsize = 4 * 1024 * 1024; /* 4 MB */
 	const int fin_delay_us = 100 * 1000;
 	static char fin_pkt[MAX_HDR_LEN];
 	struct sockaddr_ll daddr = {};
@@ -1067,6 +1258,27 @@ static void gro_sender(void)
 	if (txfd < 0)
 		error(1, errno, "socket creation");

+	if (setsockopt(txfd, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)))
+		error(1, errno, "cannot set sndbuf size, setsockopt failed");
+
+	/* Enable SO_TXTIME unless test case generates more than one flow
+	 * SO_TXTIME could result in qdisc layer sorting the packets at sender.
+	 */
+	if (strcmp(testname, "single") && strcmp(testname, "capacity")) {
+		struct sock_txtime so_txtime = { .clockid = CLOCK_MONOTONIC, };
+		struct timespec ts;
+
+		if (setsockopt(txfd, SOL_SOCKET, SO_TXTIME,
+			       &so_txtime, sizeof(so_txtime)))
+			error(1, errno, "setsockopt SO_TXTIME");
+
+		if (clock_gettime(CLOCK_MONOTONIC, &ts))
+			error(1, errno, "clock_gettime");
+
+		txtime_ns = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+		txtime_ns += TXTIME_DELAY_MS * 1000000ULL;
+	}
+
 	memset(&daddr, 0, sizeof(daddr));
 	daddr.sll_ifindex = if_nametoindex(ifname);
 	if (daddr.sll_ifindex == 0)
@@ -1199,6 +1411,19 @@ static void gro_sender(void)

 		send_large(txfd, &daddr, remainder + 1);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+	/* machinery sub-tests */
+	} else if (strcmp(testname, "single") == 0) {
+		static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+
+		create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+		write_packet(txfd, buf, total_hdr_len + PAYLOAD_LEN, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "capacity") == 0) {
+		send_capacity(txfd, &daddr);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
 	} else {
 		error(1, 0, "Unknown testcase: %s", testname);
 	}
@@ -1394,6 +1619,15 @@ static void gro_receiver(void)
 		correct_payload[2] = remainder + 1;
 		printf("last segment sent individually: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
+
+	/* machinery sub-tests */
+	} else if (strcmp(testname, "single") == 0) {
+		printf("single data packet: ");
+		correct_payload[0] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "capacity") == 0) {
+		check_capacity_pkts(rxfd);
+
 	} else {
 		error(1, 0, "Test case error: unknown testname %s", testname);
 	}
@@ -1411,16 +1645,18 @@ static void parse_args(int argc, char **argv)
 		{ "ipv4", no_argument, NULL, '4' },
 		{ "ipv6", no_argument, NULL, '6' },
 		{ "ipip", no_argument, NULL, 'e' },
+		{ "num-flows", required_argument, NULL, 'n' },
 		{ "rx", no_argument, NULL, 'r' },
 		{ "saddr", required_argument, NULL, 's' },
 		{ "smac", required_argument, NULL, 'S' },
 		{ "test", required_argument, NULL, 't' },
+		{ "order-check", no_argument, NULL, 'o' },
 		{ "verbose", no_argument, NULL, 'v' },
 		{ 0, 0, 0, 0 }
 	};
 	int c;

-	while ((c = getopt_long(argc, argv, "46d:D:ei:rs:S:t:v", opts, NULL)) != -1) {
+	while ((c = getopt_long(argc, argv, "46d:D:ei:n:rs:S:t:ov", opts, NULL)) != -1) {
 		switch (c) {
 		case '4':
 			proto = PF_INET;
@@ -1444,6 +1680,9 @@ static void parse_args(int argc, char **argv)
 		case 'i':
 			ifname = optarg;
 			break;
+		case 'n':
+			num_flows = atoi(optarg);
+			break;
 		case 'r':
 			tx_socket = false;
 			break;
@@ -1456,6 +1695,9 @@ static void parse_args(int argc, char **argv)
 		case 't':
 			testname = optarg;
 			break;
+		case 'o':
+			order_check = true;
+			break;
 		case 'v':
 			verbose = true;
 			break;