From c8cbe123be6de9deff5e5312af8848362a919f97 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:40 +0300 Subject: [PATCH 1/7] net/sched: taprio: taprio_offload_config_changed() is protected by rtnl_mutex The locking in taprio_offload_config_changed() is wrong (but also inconsequentially so). The current_entry_lock does not serialize changes to the admin and oper schedules, only to the current entry. In fact, the rtnl_mutex does that, and that is taken at the time when taprio_change() is called. Replace the rcu_dereference_protected() method with the proper RCU annotation, and drop the unnecessary spin lock. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index db88a692ef81..e9f57ef7bc17 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1193,16 +1193,10 @@ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; - spin_lock(&q->current_entry_lock); - - oper = rcu_dereference_protected(q->oper_sched, - lockdep_is_held(&q->current_entry_lock)); - admin = rcu_dereference_protected(q->admin_sched, - lockdep_is_held(&q->current_entry_lock)); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); - - spin_unlock(&q->current_entry_lock); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) From 18cdd2f0998a4967b1fff4c43ed9aef049e42c39 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:41 +0300 Subject: [PATCH 2/7] net/sched: taprio: taprio_dump and taprio_change are protected by rtnl_mutex Since the writer-side lock is taken here, we do not need to open an RCU read-side critical section, instead we can use rtnl_dereference() to tell lockdep we are serialized with concurrent writes. 
Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e9f57ef7bc17..017ccf5431aa 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1484,10 +1484,8 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } INIT_LIST_HEAD(&new_admin->entries); - rcu_read_lock(); - oper = rcu_dereference(q->oper_sched); - admin = rcu_dereference(q->admin_sched); - rcu_read_unlock(); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) @@ -1878,9 +1876,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest, *sched_nest; unsigned int i; - rcu_read_lock(); - oper = rcu_dereference(q->oper_sched); - admin = rcu_dereference(q->admin_sched); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); @@ -1924,8 +1921,6 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_nest_end(skb, sched_nest); done: - rcu_read_unlock(); - return nla_nest_end(skb, nest); admin_error: @@ -1935,7 +1930,6 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_nest_cancel(skb, nest); start_error: - rcu_read_unlock(); return -ENOSPC; } From 9af23657b33679b5b8d8579ca1cc0214398f576f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:42 +0300 Subject: [PATCH 3/7] net/sched: taprio: use rtnl_dereference for oper and admin sched in taprio_destroy() Sparse complains that taprio_destroy() dereferences q->oper_sched and q->admin_sched without rcu_dereference(), since they are marked as __rcu in the taprio private structure. 
1671:28: warning: incorrect type in argument 1 (different address spaces) 1671:28: expected struct callback_head *head 1671:28: got struct callback_head [noderef] __rcu * 1674:28: warning: incorrect type in argument 1 (different address spaces) 1674:28: expected struct callback_head *head 1674:28: got struct callback_head [noderef] __rcu * To silence that build warning, do actually use rtnl_dereference(), since we know the rtnl_mutex is held at the time of q->destroy(). Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 017ccf5431aa..38d742b335d0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1634,6 +1634,7 @@ static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *oper, *admin; unsigned int i; spin_lock(&taprio_list_lock); @@ -1657,11 +1658,14 @@ static void taprio_destroy(struct Qdisc *sch) netdev_reset_tc(dev); - if (q->oper_sched) - call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); - if (q->admin_sched) - call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb); + if (oper) + call_rcu(&oper->rcu, taprio_free_sched_cb); + + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, From fa65edde5e490988bfb8945317dd8e546bd7e7ab Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:43 +0300 Subject: [PATCH 4/7] net/sched: taprio: remove redundant FULL_OFFLOAD_IS_ENABLED check in taprio_enqueue Since commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping"), __dev_queue_xmit() will select a txq->qdisc for the full offload case of taprio which isn't the root taprio qdisc, so qdisc 
enqueues will never pass through taprio_enqueue(). That commit already introduced one safety precaution check for FULL_OFFLOAD_IS_ENABLED(); a second one is really not needed, so simplify the conditional for entering into the GSO segmentation logic. Also reword the comment a little, to appear more natural after the code change. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 38d742b335d0..17265ee930d4 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -454,10 +454,10 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Large packets might not be transmitted when the transmission duration * exceeds any configured interval. Therefore, segment the skb into - * smaller chunks. Skip it for the full offload case, as the driver - * and/or the hardware is expected to handle this. + * smaller chunks. Drivers with full offload are expected to handle + * this in hardware. */ - if (skb_is_gso(skb) && !FULL_OFFLOAD_IS_ENABLED(q->flags)) { + if (skb_is_gso(skb)) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; From 25becba6290bc34e369a0e1a76db9ca88bad87aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:44 +0300 Subject: [PATCH 5/7] net/sched: taprio: stop going through private ops for dequeue and peek Since commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping"), taprio_dequeue_soft() and taprio_peek_soft() are de facto the only implementations for Qdisc_ops :: dequeue and Qdisc_ops :: peek that taprio provides. This is because in full offload mode, __dev_queue_xmit() will select a txq->qdisc which is never the root taprio qdisc. So if nothing is enqueued in the root qdisc, it will never be run and nothing will get dequeued from it.
Therefore, we can remove the private indirection from taprio, and always point Qdisc_ops :: dequeue to taprio_dequeue_soft (now simply named taprio_dequeue) and Qdisc_ops :: peek to taprio_peek_soft (now simply named taprio_peek). Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 58 +++++++++--------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 17265ee930d4..2f45dfb259c9 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -78,8 +78,6 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - struct sk_buff *(*dequeue)(struct Qdisc *sch); - struct sk_buff *(*peek)(struct Qdisc *sch); u32 txtime_delay; }; @@ -491,7 +489,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return taprio_enqueue_one(skb, sch, child, to_free); } -static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) +static struct sk_buff *taprio_peek(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -500,6 +498,11 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) u32 gate_mask; int i; + if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { + WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); + return NULL; + } + rcu_read_lock(); entry = rcu_dereference(q->current_entry); gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; @@ -535,20 +538,6 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) return NULL; } -static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) -{ - WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); - - return NULL; -} - -static struct sk_buff *taprio_peek(struct Qdisc *sch) -{ - struct taprio_sched *q = qdisc_priv(sch); - - return q->peek(sch); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -556,7 +545,7 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) atomic64_read(&q->picos_per_byte))); } -static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -565,6 +554,11 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) u32 gate_mask; int i; + if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { + WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); + return NULL; + } + rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't @@ -644,20 +638,6 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) return skb; } -static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) -{ - WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); - - return NULL; -} - -static struct sk_buff *taprio_dequeue(struct Qdisc *sch) -{ - struct taprio_sched *q = qdisc_priv(sch); - - return q->dequeue(sch); -} - static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { @@ -1555,17 +1535,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->advance_timer.function = advance_sched; } - if 
(FULL_OFFLOAD_IS_ENABLED(q->flags)) { - q->dequeue = taprio_dequeue_offload; - q->peek = taprio_peek_offload; - } else { - /* Be sure to always keep the function pointers - * in a consistent state. - */ - q->dequeue = taprio_dequeue_soft; - q->peek = taprio_peek_soft; - } - err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); @@ -1680,9 +1649,6 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; - q->dequeue = taprio_dequeue_soft; - q->peek = taprio_peek_soft; - q->root = sch; /* We only support static clockids. Use an invalid value as default From 026de64d7bc39cc77f2084c4454a562720e9c8ff Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:45 +0300 Subject: [PATCH 6/7] net/sched: taprio: add extack messages in taprio_init Stop contributing to the proverbial user unfriendliness of tc, and tell the user what is wrong wherever possible. 
Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2f45dfb259c9..2552f62f392d 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1661,11 +1661,15 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, list_add(&q->taprio_list, &taprio_list); spin_unlock(&taprio_list_lock); - if (sch->parent != TC_H_ROOT) + if (sch->parent != TC_H_ROOT) { + NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; + } - if (!netif_is_multiqueue(dev)) + if (!netif_is_multiqueue(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; + } /* pre-allocate qdisc, attachment can't fail */ q->qdiscs = kcalloc(dev->num_tx_queues, From 2c08a4f898d0a8e08f431709a1ae728a6fddaabd Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:50:46 +0300 Subject: [PATCH 7/7] net/sched: taprio: replace safety precautions with comments The WARN_ON_ONCE() checks introduced in commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping") take a small toll on performance, but otherwise, the conditions are never expected to happen. Replace them with comments, such that the information is still conveyed to developers. 
Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2552f62f392d..b72c373edea0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -432,6 +432,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue(skb, child, to_free); } +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -439,11 +442,6 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child; int queue; - if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { - WARN_ONCE(1, "Trying to enqueue skb into the root of a taprio qdisc configured with full offload\n"); - return qdisc_drop(skb, sch, to_free); - } - queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; @@ -489,6 +487,9 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return taprio_enqueue_one(skb, sch, child, to_free); } +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ static struct sk_buff *taprio_peek(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -498,11 +499,6 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) u32 gate_mask; int i; - if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { - WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); - return NULL; - } - rcu_read_lock(); entry = rcu_dereference(q->current_entry); gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; @@ -545,6 +541,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) atomic64_read(&q->picos_per_byte))); } +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -554,11 +553,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) u32 gate_mask; int i; - if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { - WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); - return NULL; - } - rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't