Files
linux/kernel/sched/pelt.h
Valentin Schneider e1fad12dcb sched/fair: Switch to task based throttle model
In current throttle model, when a cfs_rq is throttled, its entity will
be dequeued from cpu's rq, making tasks attached to it not able to run,
thus achiveing the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it can not run till its task group's
next period comes, which can be a relatively long time. Waiting writer
will have to wait longer due to this and it also makes further reader
build up and eventually trigger task hung.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.

Throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock is stopped once it entered throttled state but since
now tasks(in kernel mode) can continue to run, change the behaviour to
stop PELT clock when the throttled cfs_rq has no tasks left.

Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com
2025-09-03 10:03:14 +02:00

190 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#ifndef _KERNEL_SCHED_PELT_H
#define _KERNEL_SCHED_PELT_H
#include "sched.h"
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
bool update_other_load_avgs(struct rq *rq);
#ifdef CONFIG_SCHED_HW_PRESSURE
int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
static inline u64 hw_load_avg(struct rq *rq)
{
return READ_ONCE(rq->avg_hw.load_avg);
}
#else /* !CONFIG_SCHED_HW_PRESSURE: */
static inline int
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
#endif /* !CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
static inline u32 get_pelt_divider(struct sched_avg *avg)
{
return PELT_MIN_DIVIDER + avg->period_contrib;
}
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already reset */
enqueued = avg->util_est;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est, enqueued);
}
static inline u64 rq_clock_pelt(struct rq *rq)
{
lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
}
/* The rq is idle, we can sync to clock_task */
static inline void _update_idle_rq_clock_pelt(struct rq *rq)
{
rq->clock_pelt = rq_clock_task(rq);
u64_u32_store(rq->clock_idle, rq_clock(rq));
/* Paired with smp_rmb in migrate_se_pelt_lag() */
smp_wmb();
u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
* clock_task when rq is idle.
*
*
* absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
* @ max capacity ------******---------------******---------------
* @ half capacity ------************---------************---------
* clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
*
*/
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
_update_idle_rq_clock_pelt(rq);
return;
}
/*
* When a rq runs at a lower compute capacity, it will need
* more time to do the same amount of work than at max
* capacity. In order to be invariant, we scale the delta to
* reflect how much work has been really done.
* Running longer results in stealing idle time that will
* disturb the load signal compared to max capacity. This
* stolen idle time will be automatically reflected when the
* rq will be idle and the clock will be synced with
* rq_clock_task.
*/
/*
* Scale the elapsed time to reflect the real amount of
* computation
*/
delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;
}
/*
* When rq becomes idle, we have to check if it has lost idle time
* because it was fully busy. A rq is fully used when the /Sum util_sum
* is greater or equal to:
* (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
* For optimization and computing rounding purpose, we don't take into account
* the position in the current window (period_contrib) and we use the higher
* bound of util_sum to decide.
*/
static inline void update_idle_rq_clock_pelt(struct rq *rq)
{
u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
u32 util_sum = rq->cfs.avg.util_sum;
util_sum += rq->avg_rt.util_sum;
util_sum += rq->avg_dl.util_sum;
/*
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
* considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
_update_idle_rq_clock_pelt(rq);
}
#ifdef CONFIG_CFS_BANDWIDTH
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else /* !CONFIG_CFS_BANDWIDTH: */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
}
#endif /* !CONFIG_CFS_BANDWIDTH */
#endif /* _KERNEL_SCHED_PELT_H */