From 1e08511d5d01884a3c9070afd52a47799312074a Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:12 -0700 Subject: [PATCH 1/8] igb: clear out skb->tstamp after reading the txtime If a packet which is utilizing the launchtime feature (via SO_TXTIME socket option) also requests the hardware transmit timestamp, the hardware timestamp is not delivered to the userspace. This is because the value in skb->tstamp is mistaken as the software timestamp. Applications, like ptp4l, request a hardware timestamp by setting the SOF_TIMESTAMPING_TX_HARDWARE socket option. Whenever a new timestamp is detected by the driver (this work is done in igb_ptp_tx_work() which calls igb_ptp_tx_hwtstamps() in igb_ptp.c[1]), it will queue the timestamp in the ERR_QUEUE for the userspace to read. When the userspace is ready, it will issue a recvmsg() call to collect this timestamp. The problem is in this recvmsg() call. If the skb->tstamp is not cleared out, it will be interpreted as a software timestamp and the hardware tx timestamp will not be successfully sent to the userspace. Look at skb_is_swtx_tstamp() and the callee function __sock_recv_timestamp() in net/socket.c for more details. Signed-off-by: Vedang Patel Tested-by: Aaron Brown Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fc925adbd9fa..f66dae72fe37 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5688,6 +5688,7 @@ static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, */ if (tx_ring->launchtime_enable) { ts = ns_to_timespec64(first->skb->tstamp); + first->skb->tstamp = 0; context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32); } else { context_desc->seqnum_seed = 0; From 9903c8dc734265689d5770ff28c84a7228fe5890 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:13 -0700 Subject: [PATCH 2/8] etf: Don't use BIT() in UAPI headers. The BIT() macro isn't exported as part of the UAPI interface. So, the compile-test to ensure they are self contained fails. So, use _BITUL() instead. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8b2f993cbb77..f88c4e0bd9e5 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -988,8 +988,8 @@ struct tc_etf_qopt { __s32 delta; __s32 clockid; __u32 flags; -#define TC_ETF_DEADLINE_MODE_ON BIT(0) -#define TC_ETF_OFFLOAD_ON BIT(1) +#define TC_ETF_DEADLINE_MODE_ON _BITUL(0) +#define TC_ETF_OFFLOAD_ON _BITUL(1) }; enum { From d14d2b20680f02fa739c2cbbb59e3629e487f359 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:14 -0700 Subject: [PATCH 3/8] etf: Add skip_sock_check Currently, etf expects a socket with SO_TXTIME option set for each packet it encounters. So, it will drop all other packets. But, in the future commits we are planning to add functionality where tstamp value will be set by another qdisc. Also, some packets which are generated from within the kernel (e.g. ICMP packets) do not have any socket associated with them. So, this commit adds support for skip_sock_check. When this option is set, etf will skip checking for a socket and other associated options for all skbs. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f88c4e0bd9e5..127ac6d2888c 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -990,6 +990,7 @@ struct tc_etf_qopt { __u32 flags; #define TC_ETF_DEADLINE_MODE_ON _BITUL(0) #define TC_ETF_OFFLOAD_ON _BITUL(1) +#define TC_ETF_SKIP_SOCK_CHECK _BITUL(2) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index db0c2ba1d156..cebfb65d8556 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -22,10 +22,12 @@ #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) +#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; + bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ @@ -77,6 +79,9 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) struct sock *sk = nskb->sk; ktime_t now; + if (q->skip_sock_check) + goto skip; + if (!sk) return false; @@ -92,6 +97,7 @@ static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; +skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; @@ -385,6 +391,7 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: @@ -473,6 +480,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; + if (q->skip_sock_check) + opt.flags |= TC_ETF_SKIP_SOCK_CHECK; + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; From 037be0374078e205ca802ca8716dabb6064f940e Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:15 -0700 Subject: [PATCH 4/8] taprio: calculate cycle_time when schedule is installed cycle time for a particular schedule is calculated only when it is first installed. So, it makes sense to just calculate it once right after the 'cycle_time' parameter has been parsed and store it in cycle_time. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ecfb8f5902a..a41d7d4434ee 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -108,22 +108,6 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } -static ktime_t get_cycle_time(struct sched_gate_list *sched) -{ - struct sched_entry *entry; - ktime_t cycle = 0; - - if (sched->cycle_time != 0) - return sched->cycle_time; - - list_for_each_entry(entry, &sched->entries, list) - cycle = ktime_add_ns(cycle, entry->interval); - - sched->cycle_time = cycle; - - return cycle; -} - static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -524,6 +508,15 @@ static int parse_taprio_schedule(struct nlattr **tb, if (err < 0) return err; + if (!new->cycle_time) { + struct sched_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &new->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + new->cycle_time = cycle; + } + return 0; } @@ -605,7 +598,7 @@ static int taprio_get_start_time(struct Qdisc *sch, return 0; } - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, @@ -632,7 +625,7 @@ static void setup_first_close_time(struct taprio_sched *q, first = list_first_entry(&sched->entries, struct sched_entry, list); - cycle = get_cycle_time(sched); + cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_close_time = ktime_add_ns(base, cycle); From 566af331b5a6ae3969cf212c2077457d4963b0d1 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:16 -0700 Subject: [PATCH 5/8] taprio: Remove inline directive Remove inline directive from length_to_duration(). We will let the compiler make the decisions. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a41d7d4434ee..6ef0cc03fdb9 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -168,7 +168,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } -static inline int length_to_duration(struct taprio_sched *q, int len) +static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); } From 4cfd5779bd6efe8c76b4494aec63a063be0d2ff2 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:17 -0700 Subject: [PATCH 6/8] taprio: Add support for txtime-assist mode Currently, we are seeing non-critical packets being transmitted outside of their timeslice. We can confirm that the packets are being dequeued at the right time. So, the delay is induced in the hardware side. The most likely reason is the hardware queues are starving the lower priority queues. In order to improve the performance of taprio, we will be making use of the txtime feature provided by the ETF qdisc. For all the packets which do not have the SO_TXTIME option set, taprio will set the transmit timestamp (set in skb->tstamp) in this mode. TAPrio Qdisc will ensure that the transmit time for the packet is set to when the gate is open. If SO_TXTIME is set, the TAPrio qdisc will validate whether the timestamp (in skb->tstamp) occurs when the gate corresponding to skb's traffic class is open. Following two parameters added to support this mode: - flags: used to enable txtime-assist mode. Will also be used to enable other modes (like hardware offloading) later. - txtime-delay: This indicates the minimum time it will take for the packet to hit the wire. This is useful in determining whether we can transmit the packet in the remaining time if the gate corresponding to the packet is currently open. An example configuration for enabling txtime-assist: tc qdisc replace dev eth0 parent root handle 100 taprio \\ num_tc 3 \\ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\ queues 1@0 1@0 1@0 \\ base-time 1558653424279842568 \\ sched-entry S 01 300000 \\ sched-entry S 02 300000 \\ sched-entry S 04 400000 \\ flags 0x1 \\ txtime-delay 40000 \\ clockid CLOCK_TAI tc qdisc replace dev $IFACE parent 100:1 etf skip_sock_check \\ offload delta 200000 clockid CLOCK_TAI Note that all the traffic classes are mapped to the same queue. This is only possible in taprio when txtime-assist is enabled. Also, note that the ETF Qdisc is enabled with offload mode set. In this mode, if the packet's traffic class is open and the complete packet can be transmitted, taprio will try to transmit the packet immediately. This will be done by setting skb->tstamp to current_time + the time delta indicated in the txtime-delay parameter. This parameter indicates the time taken (in software) for packet to reach the network adapter. If the packet cannot be transmitted in the current interval or if the packet's traffic is not currently transmitting, the skb->tstamp is set to the next available timestamp value. This is tracked in the next_launchtime parameter in the struct sched_entry. The behaviour w.r.t admin and oper schedules is not changed from what is present in software mode. The transmit time is already known in advance. So, we do not need the HR timers to advance the schedule and wakeup the dequeue side of taprio. So, HR timer won't be run when this mode is enabled. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 4 + net/sched/sch_taprio.c | 343 +++++++++++++++++++++++++++++++-- 2 files changed, 329 insertions(+), 18 deletions(-) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 127ac6d2888c..390efb54b2e0 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1159,6 +1159,8 @@ enum { * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] */ +#define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST 0x1 + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1170,6 +1172,8 @@ enum { TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ + TCA_TAPRIO_ATTR_FLAGS, /* u32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 6ef0cc03fdb9..078230e44471 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -21,12 +21,16 @@ #include #include #include +#include static LIST_HEAD(taprio_list); static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 +#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)) +#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) + struct sched_entry { struct list_head list; @@ -35,6 +39,7 @@ struct sched_entry { * packet leaves after this time. */ ktime_t close_time; + ktime_t next_txtime; atomic_t budget; int index; u32 gate_mask; @@ -55,6 +60,7 @@ struct sched_gate_list { struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; + u32 flags; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte @@ -68,6 +74,7 @@ struct taprio_sched { ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; + int txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -108,6 +115,227 @@ static void switch_schedules(struct taprio_sched *q, *admin = NULL; } +/* Get how much time has been already elapsed in the current cycle. */ +static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) +{ + ktime_t time_since_sched_start; + s32 time_elapsed; + + time_since_sched_start = ktime_sub(time, sched->base_time); + div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); + + return time_elapsed; +} + +static ktime_t get_interval_end_time(struct sched_gate_list *sched, + struct sched_gate_list *admin, + struct sched_entry *entry, + ktime_t intv_start) +{ + s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); + ktime_t intv_end, cycle_ext_end, cycle_end; + + cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); + intv_end = ktime_add_ns(intv_start, entry->interval); + cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); + + if (ktime_before(intv_end, cycle_end)) + return intv_end; + else if (admin && admin != sched && + ktime_after(admin->base_time, cycle_end) && + ktime_before(admin->base_time, cycle_ext_end)) + return admin->base_time; + else + return cycle_end; +} + +static int length_to_duration(struct taprio_sched *q, int len) +{ + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +/* Returns the entry corresponding to next available interval. If + * validate_interval is set, it only validates whether the timestamp occurs + * when the gate corresponding to the skb's traffic class is open. + */ +static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, + struct Qdisc *sch, + struct sched_gate_list *sched, + struct sched_gate_list *admin, + ktime_t time, + ktime_t *interval_start, + ktime_t *interval_end, + bool validate_interval) +{ + ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; + ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; + struct sched_entry *entry = NULL, *entry_found = NULL; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool entry_available = false; + s32 cycle_elapsed; + int tc, n; + + tc = netdev_get_prio_tc_map(dev, skb->priority); + packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); + + *interval_start = 0; + *interval_end = 0; + + if (!sched) + return NULL; + + cycle = sched->cycle_time; + cycle_elapsed = get_cycle_time_elapsed(sched, time); + curr_intv_end = ktime_sub_ns(time, cycle_elapsed); + cycle_end = ktime_add_ns(curr_intv_end, cycle); + + list_for_each_entry(entry, &sched->entries, list) { + curr_intv_start = curr_intv_end; + curr_intv_end = get_interval_end_time(sched, admin, entry, + curr_intv_start); + + if (ktime_after(curr_intv_start, cycle_end)) + break; + + if (!(entry->gate_mask & BIT(tc)) || + packet_transmit_time > entry->interval) + continue; + + txtime = entry->next_txtime; + + if (ktime_before(txtime, time) || validate_interval) { + transmit_end_time = ktime_add_ns(time, packet_transmit_time); + if ((ktime_before(curr_intv_start, time) && + ktime_before(transmit_end_time, curr_intv_end)) || + (ktime_after(curr_intv_start, time) && !validate_interval)) { + entry_found = entry; + *interval_start = curr_intv_start; + *interval_end = curr_intv_end; + break; + } else if (!entry_available && !validate_interval) { + /* Here, we are just trying to find out the + * first available interval in the next cycle. + */ + entry_available = 1; + entry_found = entry; + *interval_start = ktime_add_ns(curr_intv_start, cycle); + *interval_end = ktime_add_ns(curr_intv_end, cycle); + } + } else if (ktime_before(txtime, earliest_txtime) && + !entry_available) { + earliest_txtime = txtime; + entry_found = entry; + n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); + *interval_start = ktime_add(curr_intv_start, n * cycle); + *interval_end = ktime_add(curr_intv_end, n * cycle); + } + } + + return entry_found; +} + +static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t interval_start, interval_end; + struct sched_entry *entry; + + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + + entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, + &interval_start, &interval_end, true); + rcu_read_unlock(); + + return entry; +} + +/* There are a few scenarios where we will have to modify the txtime from + * what is read from next_txtime in sched_entry. They are: + * 1. If txtime is in the past, + * a. The gate for the traffic class is currently open and packet can be + * transmitted before it closes, schedule the packet right away. + * b. If the gate corresponding to the traffic class is going to open later + * in the cycle, set the txtime of packet to the interval start. + * 2. If txtime is in the future, there are packets corresponding to the + * current traffic class waiting to be transmitted. So, the following + * possibilities exist: + * a. We can transmit the packet before the window containing the txtime + * closes. + * b. The window might close before the transmission can be completed + * successfully. So, schedule the packet in the next open window. + */ +static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) +{ + ktime_t transmit_end_time, interval_end, interval_start; + struct taprio_sched *q = qdisc_priv(sch); + struct sched_gate_list *sched, *admin; + ktime_t minimum_time, now, txtime; + int len, packet_transmit_time; + struct sched_entry *entry; + bool sched_changed; + + now = q->get_time(); + minimum_time = ktime_add_ns(now, q->txtime_delay); + + rcu_read_lock(); + admin = rcu_dereference(q->admin_sched); + sched = rcu_dereference(q->oper_sched); + if (admin && ktime_after(minimum_time, admin->base_time)) + switch_schedules(q, &admin, &sched); + + /* Until the schedule starts, all the queues are open */ + if (!sched || ktime_before(minimum_time, sched->base_time)) { + txtime = minimum_time; + goto done; + } + + len = qdisc_pkt_len(skb); + packet_transmit_time = length_to_duration(q, len); + + do { + sched_changed = 0; + + entry = find_entry_to_transmit(skb, sch, sched, admin, + minimum_time, + &interval_start, &interval_end, + false); + if (!entry) { + txtime = 0; + goto done; + } + + txtime = entry->next_txtime; + txtime = max_t(ktime_t, txtime, minimum_time); + txtime = max_t(ktime_t, txtime, interval_start); + + if (admin && admin != sched && + ktime_after(txtime, admin->base_time)) { + sched = admin; + sched_changed = 1; + continue; + } + + transmit_end_time = ktime_add(txtime, packet_transmit_time); + minimum_time = transmit_end_time; + + /* Update the txtime of current entry to the next time it's + * interval starts. + */ + if (ktime_after(transmit_end_time, interval_end)) + entry->next_txtime = ktime_add(interval_start, sched->cycle_time); + } while (sched_changed || ktime_after(transmit_end_time, interval_end)); + + entry->next_txtime = transmit_end_time; + +done: + rcu_read_unlock(); + return txtime; +} + static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -121,6 +349,15 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); + if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { + if (!is_valid_interval(skb, sch)) + return qdisc_drop(skb, sch, to_free); + } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb->tstamp = get_packet_txtime(skb, sch); + if (!skb->tstamp) + return qdisc_drop(skb, sch, to_free); + } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; @@ -156,6 +393,9 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) if (!skb) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + return skb; + prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); @@ -168,11 +408,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) return NULL; } -static int length_to_duration(struct taprio_sched *q, int len) -{ - return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -216,6 +451,13 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!child)) continue; + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { + skb = child->ops->dequeue(child); + if (!skb) + continue; + goto skb_found; + } + skb = child->ops->peek(child); if (!skb) continue; @@ -246,6 +488,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto done; +skb_found: qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; @@ -522,7 +765,8 @@ static int parse_taprio_schedule(struct nlattr **tb, static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + u32 taprio_flags) { int i, j; @@ -570,6 +814,9 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return -EINVAL; } + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) + continue; + /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (last > qopt->offset[j]) { @@ -700,6 +947,18 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; } +static void setup_txtime(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) +{ + struct sched_entry *entry; + u32 interval = 0; + + list_for_each_entry(entry, &sched->entries, list) { + entry->next_txtime = ktime_add_ns(base, interval); + interval += entry->interval; + } +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -708,6 +967,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; + u32 taprio_flags = 0; int i, err, clockid; unsigned long flags; ktime_t start; @@ -720,7 +980,21 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (tb[TCA_TAPRIO_ATTR_FLAGS]) { + taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); + + if (q->flags != 0 && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } else if (!FLAGS_VALID(taprio_flags)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + q->flags = taprio_flags; + } + + err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); if (err < 0) return err; @@ -779,7 +1053,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); - if (!hrtimer_active(&q->advance_timer)) { + if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { + if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { + NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); + err = -EINVAL; + goto unlock; + } + + q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + } + + if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && + !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } @@ -822,20 +1107,35 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - setup_first_close_time(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { + setup_txtime(q, new_admin, start); - /* Protects against advance_sched() */ - spin_lock_irqsave(&q->current_entry_lock, flags); + if (!oper) { + rcu_assign_pointer(q->oper_sched, new_admin); + err = 0; + new_admin = NULL; + goto unlock; + } - taprio_start_sched(sch, start, new_admin); + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + } else { + setup_first_close_time(q, new_admin, start); + + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); + + taprio_start_sched(sch, start, new_admin); + + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + } - rcu_assign_pointer(q->admin_sched, new_admin); - if (admin) - call_rcu(&admin->rcu, taprio_free_sched_cb); new_admin = NULL; - - spin_unlock_irqrestore(&q->current_entry_lock, flags); - err = 0; unlock: @@ -1073,6 +1373,13 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; + if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) + goto options_error; + + if (q->txtime_delay && + nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; From 7ede7b03484bbb035aa5be98c45a40cfabdc0738 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:18 -0700 Subject: [PATCH 7/8] taprio: make clock reference conversions easier Later in this series we will need to transform from CLOCK_MONOTONIC (used in TCP) to the clock reference used in TAPRIO. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 078230e44471..5c5e7db520dc 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -61,6 +61,7 @@ struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; + enum tk_offsets tk_offset; int clockid; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte @@ -71,7 +72,6 @@ struct taprio_sched { struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; - ktime_t (*get_time)(void); struct hrtimer advance_timer; struct list_head taprio_list; int txtime_delay; @@ -85,6 +85,20 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } +static ktime_t taprio_get_time(struct taprio_sched *q) +{ + ktime_t mono = ktime_get(); + + switch (q->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, q->tk_offset); + } + + return KTIME_MAX; +} + static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); @@ -278,7 +292,7 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) struct sched_entry *entry; bool sched_changed; - now = q->get_time(); + now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); rcu_read_lock(); @@ -469,7 +483,7 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) continue; len = qdisc_pkt_len(skb); - guard = ktime_add_ns(q->get_time(), + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no @@ -838,7 +852,7 @@ static int taprio_get_start_time(struct Qdisc *sch, s64 n; base = sched_base_time(sched); - now = q->get_time(); + now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; @@ -1084,16 +1098,16 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, switch (q->clockid) { case CLOCK_REALTIME: - q->get_time = ktime_get_real; + q->tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->get_time = ktime_get; + q->tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->get_time = ktime_get_boottime; + q->tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->get_time = ktime_get_clocktai; + q->tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); From 54002066100b6d2f731157156c41d853e0c9137e Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 25 Jun 2019 15:07:19 -0700 Subject: [PATCH 8/8] taprio: Adjust timestamps for TCP packets When the taprio qdisc is running in "txtime offload" mode, it will set the launchtime value (in skb->tstamp) for all the packets which do not have the SO_TXTIME socket option. But, the TCP packets already have this value set and it indicates the earliest departure time represented in CLOCK_MONOTONIC clock. We need to respect the timestamp set by the TCP subsystem. So, convert this time to the clock which taprio is using and ensure that the packet is not transmitted before the deadline set by TCP. Signed-off-by: Vedang Patel Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 5c5e7db520dc..388750ddc57a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -22,6 +22,7 @@ #include #include #include +#include static LIST_HEAD(taprio_list); static DEFINE_SPINLOCK(taprio_list_lock); @@ -267,6 +268,41 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) return entry; } +/* This returns the tstamp value set by TCP in terms of the set clock. */ +static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; + + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + if (!ipv6h) + return 0; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return 0; + } else if (iph->protocol != IPPROTO_TCP) { + return 0; + } + } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { + return 0; + } + + return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); +} + /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, @@ -284,7 +320,7 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { - ktime_t transmit_end_time, interval_end, interval_start; + ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; @@ -295,6 +331,9 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); + tcp_tstamp = get_tcp_tstamp(q, skb); + minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); + rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched);